//
// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//

.version 7.8
.target sm_86
.address_size 64

    // .globl    fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl
.extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[];

.visible .entry fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl(
    .param .align 8 .b8 fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0[168]
)
{
    .reg .pred %p<129>;
    .reg .b16 %rs<75>;
    .reg .f32 %f<1298>;
    .reg .b32 %r<2922>;
    .reg .b64 %rd<219>;

    mov.b64 %rd45, fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0;
    mov.u64 %rd1, %rd45;
    ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0+56];
    ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0+52];
    mov.u32 %r841, %ctaid.z;
    shl.b32 %r3, %r841, 6;
    setp.le.s32 %p5, %r1, %r3;
    @%p5 bra $L__BB0_112;
    mov.u32 %r878, %tid.x;
    mov.u32 %r879, %ctaid.y;
    mov.u32 %r880, %ctaid.x;
    mul.lo.s32 %r881, %r1, %r879;
    mad.lo.s32 %r882, %r881, %r2, %r880;
    shr.s32 %r883, %r878, 31;
    shr.u32 %r884, %r883, 27;
    add.s32 %r885, %r878, %r884;
    and.b32 %r886, %r885, -32;
    sub.s32 %r4, %r878, %r886;
    shr.u32 %r887, %r883, 25;
    add.s32 %r888, %r878, %r887;
    shr.s32 %r889, %r888, 7;
    shl.b32 %r890, %r889, 4;
    shr.s32 %r891, %r4, 31;
    shr.u32 %r892, %r891, 30;
    add.s32 %r893, %r4, %r892;
    and.b32 %r894, %r893, 2147483644;
    sub.s32 %r895, %r4, %r894;
    shl.b32 %r896, %r895, 1;
    add.s32 %r2731, %r896, %r890;
    ld.param.u64 %rd2, [%rd1+16];
    ld.param.u64 %rd3, [%rd1+40];
    shr.s32 %r6, %r885, 5;
    shr.s32 %r897, %r885, 31;
    shr.u32 %r898, %r897, 30;
    add.s32 %r899, %r6, %r898;
    and.b32 %r900, %r899, 268435452;
    sub.s32 %r901, %r6, %r900;
    shl.b32 %r902, %r901, 4;
    shr.s32 %r903, %r893, 2;
    add.s32 %r7, %r902, %r903;
    add.s32 %r904, %r6, %r3;
    cvt.s64.s32 %rd4, %r904;
    ld.param.u64 %rd5, [%rd1+24];
    mul.lo.s64 %rd62, %rd5, %rd4;
    mul.lo.s32 %r905, %r882, 3;
    mul.wide.s32 %rd63, %r905, 512;
    shl.b32 %r906, %r4, 4;
    cvt.s64.s32 %rd64, %r906;
    add.s64 %rd65, %rd63, %rd64;
    add.s64 %rd66, %rd65, %rd62;
    ld.param.u64 %rd67, [%rd1];
    add.s64 %rd46, %rd67, %rd66;
    shr.u32 %r907, %r897, 29;
    add.s32 %r908, %r6, %r907;
    and.b32 %r909, %r908, 268435448;
    sub.s32 %r910, %r6, %r909;
    xor.b32 %r911, %r910, %r4;
    shl.b32 %r912, %r6, 9;
    shl.b32 %r913, %r911, 4;
    mov.u32 %r914, 31;
    mov.u32 %r2548, 0;
    mov.u32 %r915, -1;
    shfl.sync.idx.b32 %r8|%p1, %r2548, %r2548, %r914, %r915;
    shfl.sync.idx.b32 %r916|%p6, %r2548, %r2548, %r914, %r915;
    and.b32 %r917, %r878, 96;
    shr.u32 %r918, %r917, 1;
    and.b32 %r919, %r878, 15;
    or.b32 %r920, %r918, %r919;
    and.b32 %r921, %r878, 7;
    shl.b32 %r922, %r878, 4;
    and.b32 %r923, %r922, 112;
    and.b32 %r924, %r878, 16;
    xor.b32 %r925, %r923, %r924;
    cvt.s64.s32 %rd68, %r6;
    mul.lo.s64 %rd69, %rd5, %rd68;
    add.s32 %r926, %r905, 1;
    mul.wide.s32 %rd70, %r926, 512;
    add.s64 %rd71, %rd70, %rd64;
    add.s64 %rd72, %rd71, %rd69;
    cvta.to.global.u64 %rd73, %rd67;
    add.s64 %rd218, %rd73, %rd72;
    shfl.sync.idx.b32 %r9|%p2, %r2548, %r2548, %r914, %r915;
    shfl.sync.idx.b32 %r10|%p3, %r2548, %r2548, %r914, %r915;
    shr.u32 %r927, %r924, 1;
    or.b32 %r928, %r927, %r921;
    and.b32 %r929, %r878, 8;
    shr.u32 %r930, %r929, 3;
    xor.b32 %r931, %r930, %r921;
    add.s32 %r932, %r905, 2;
    mul.wide.s32 %rd74, %r932, 512;
    add.s64 %rd75, %rd74, %rd64;
    add.s64 %rd76, %rd75, %rd69;
    add.s64 %rd217, %rd73, %rd76;
    shfl.sync.idx.b32 %r933|%p7, %r2548, %r2548, %r914, %r915;
    shfl.sync.idx.b32 %r11|%p4, %r2548, %r2548, %r914, %r915;
    ld.param.u64 %rd9, [%rd1+32];
    ld.param.u64 %rd10, [%rd1+8];
    sub.s32 %r934, %r1, %r3;
    min.s32 %r935, %r934, 64;
    shl.b32 %r939, %r931, 4;
    shl.b32 %r940, %r928, 9;
    shl.b32 %r941, %r920, 9;
    setp.lt.s32 %p8, %r6, %r935;
    add.s32 %r14, %r6, 4;
    setp.lt.s32 %p9, %r14, %r935;
    add.s32 %r15, %r6, 8;
    setp.lt.s32 %p10, %r15, %r935;
    add.s32 %r16, %r6, 12;
    setp.lt.s32 %p11, %r16, %r935;
    add.s32 %r942, %r6, 16;
    setp.lt.s32 %p12, %r942, %r935;
    add.s32 %r943, %r6, 20;
    setp.lt.s32 %p13, %r943, %r935;
    add.s32 %r944, %r6, 24;
    setp.lt.s32 %p14, %r944, %r935;
    add.s32 %r945, %r6, 28;
    setp.lt.s32 %p15, %r945, %r935;
    add.s32 %r946, %r6, 32;
    setp.lt.s32 %p16, %r946, %r935;
    add.s32 %r947, %r6, 36;
    setp.lt.s32 %p17, %r947, %r935;
    add.s32 %r948, %r6, 40;
    setp.lt.s32 %p18, %r948, %r935;
    add.s32 %r949, %r6, 44;
    setp.lt.s32 %p19, %r949, %r935;
    add.s32 %r950, %r6, 48;
    setp.lt.s32 %p20, %r950, %r935;
    add.s32 %r951, %r6, 52;
    setp.lt.s32 %p21, %r951, %r935;
    add.s32 %r952, %r6, 56;
    setp.lt.s32 %p22, %r952, %r935;
    add.s32 %r953, %r6, 60;
    setp.lt.s32 %p23, %r953, %r935;
    add.s32 %r17, %r913, %r912;
    or.b32 %r18, %r941, %r925;
    or.b32 %r19, %r940, %r939;
    shl.b64 %rd77, %rd5, 2;
    selp.b32 %r853, 16, 0, %p13;
    selp.b32 %r855, 16, 0, %p14;
    selp.b32 %r857, 16, 0, %p15;
    selp.b32 %r859, 16, 0, %p16;
    selp.b32 %r861, 16, 0, %p17;
    selp.b32 %r863, 16, 0, %p18;
    selp.b32 %r865, 16, 0, %p19;
    selp.b32 %r867, 16, 0, %p20;
    selp.b32 %r869, 16, 0, %p21;
    mov.u32 %r954, _ZN25fused_multihead_attention5smem_E;
    add.s32 %r955, %r916, %r954;
    add.s32 %r842, %r955, %r17;
    add.s32 %r956, %r17, 2048;
    xor.b32 %r20, %r956, 64;
    add.s32 %r844, %r955, %r20;
    add.s32 %r846, %r842, 4096;
    add.s32 %r957, %r17, 6144;
    xor.b32 %r21, %r957, 64;
    add.s32 %r848, %r955, %r21;
    add.s32 %r850, %r842, 8192;
    add.s32 %r958, %r17, 10240;
    xor.b32 %r959, %r958, 64;
    add.s32 %r852, %r955, %r959;
    add.s32 %r854, %r842, 12288;
    add.s32 %r960, %r17, 14336;
    xor.b32 %r961, %r960, 64;
    add.s32 %r856, %r955, %r961;
    add.s32 %r858, %r842, 16384;
    add.s32 %r962, %r17, 18432;
    xor.b32 %r963, %r962, 64;
    add.s32 %r860, %r955, %r963;
    add.s32 %r862, %r842, 20480;
    add.s32 %r964, %r17, 22528;
    xor.b32 %r965, %r964, 64;
    add.s32 %r864, %r955, %r965;
    add.s32 %r866, %r842, 24576;
    add.s32 %r966, %r17, 26624;
    xor.b32 %r967, %r966, 64;
    add.s32 %r868, %r955, %r967;
    add.s32 %r870, %r842, 28672;
    add.s32 %r968, %r17, 30720;
    xor.b32 %r969, %r968, 64;
    add.s32 %r872, %r955, %r969;
    selp.b32 %r843, 16, 0, %p8;
    // begin inline asm
    cp.async.cg.shared.global [%r842], [%rd46], 16, %r843;
    // end inline asm
    selp.b32 %r845, 16, 0, %p9;
    add.s64 %rd47, %rd46, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r844], [%rd47], 16, %r845;
    // end inline asm
    selp.b32 %r847, 16, 0, %p10;
    add.s64 %rd48, %rd47, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r846], [%rd48], 16, %r847;
    // end inline asm
    selp.b32 %r849, 16, 0, %p11;
    add.s64 %rd49, %rd48, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r848], [%rd49], 16, %r849;
    // end inline asm
    selp.b32 %r851, 16, 0, %p12;
    add.s64 %rd50, %rd49, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r850], [%rd50], 16, %r851;
    // end inline asm
    add.s64 %rd51, %rd50, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r852], [%rd51], 16, %r853;
    // end inline asm
    add.s64 %rd52, %rd51, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r854], [%rd52], 16, %r855;
    // end inline asm
    add.s64 %rd53, %rd52, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r856], [%rd53], 16, %r857;
    // end inline asm
    add.s64 %rd54, %rd53, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r858], [%rd54], 16, %r859;
    // end inline asm
    add.s64 %rd55, %rd54, %rd77;
    // begin inline asm
    cp.async.cg.shared.global [%r860], [%rd55], 16, %r861;
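//
// ---- prologue and Q-tile staging (annotation) ----
// The index arithmetic above derives the lane id (%r4), warp id (%r6) and
// XOR-swizzled shared-memory offsets from %tid.x; the shfl.sync.idx.b32
// broadcasts of zero appear to exist only to materialize warp-uniform
// buffer offsets. The chain of sixteen cp.async.cg.shared.global copies
// (the last six continue below) stages the 64x256 fp16 Q tile into dynamic
// shared memory, 16 bytes per thread per copy; the selp.b32 operands
// (16 or 0) shrink a copy to a zero-fill once its row reaches
// min(%r1 - %r3, 64), so rows past the sequence length read as zeros
// instead of out of bounds. The bases %rd46, %rd218, %rd217 lie 512 bytes
// apart per token (%r905*512, +512, +1024), consistent with a packed
// interleaved QKV layout: Q is staged here, and the predicated v4.u32
// loads below fetch the matching K and V tiles (stored to smem +32768 and
// +40960). In CUDA-style pseudocode, an illustrative sketch with
// hypothetical names, not the generated code:
//
//   size_t zfill = (row < valid_rows) ? 0 : 16;   // selp.b32 16/0
//   __pipeline_memcpy_async(&smem_q[swizzle(row, col)],
//                           &gmem_qkv[q_base + row * row_stride],
//                           16, zfill);            // 16-byte cp.async.cg
//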
// end inline asm add.s64 %rd56, %rd55, %rd77; // begin inline asm cp.async.cg.shared.global [%r862], [%rd56], 16, %r863; // end inline asm add.s64 %rd57, %rd56, %rd77; // begin inline asm cp.async.cg.shared.global [%r864], [%rd57], 16, %r865; // end inline asm add.s64 %rd58, %rd57, %rd77; // begin inline asm cp.async.cg.shared.global [%r866], [%rd58], 16, %r867; // end inline asm add.s64 %rd59, %rd58, %rd77; // begin inline asm cp.async.cg.shared.global [%r868], [%rd59], 16, %r869; // end inline asm selp.b32 %r871, 16, 0, %p22; add.s64 %rd60, %rd59, %rd77; // begin inline asm cp.async.cg.shared.global [%r870], [%rd60], 16, %r871; // end inline asm selp.b32 %r873, 16, 0, %p23; add.s64 %rd61, %rd60, %rd77; // begin inline asm cp.async.cg.shared.global [%r872], [%rd61], 16, %r873; // end inline asm min.s32 %r22, %r1, 16; setp.ge.s32 %p24, %r6, %r22; mov.u32 %r2544, %r2548; mov.u32 %r2545, %r2548; mov.u32 %r2546, %r2548; mov.u32 %r2547, %r2548; @%p24 bra $L__BB0_3; ld.global.v4.u32 {%r2544, %r2545, %r2546, %r2547}, [%rd218]; $L__BB0_3: add.s64 %rd12, %rd218, %rd77; setp.ge.s32 %p25, %r14, %r22; mov.u32 %r2549, %r2548; mov.u32 %r2550, %r2548; mov.u32 %r2551, %r2548; @%p25 bra $L__BB0_5; ld.global.v4.u32 {%r2548, %r2549, %r2550, %r2551}, [%rd12]; $L__BB0_5: add.s64 %rd13, %rd12, %rd77; setp.ge.s32 %p26, %r15, %r22; mov.u32 %r2556, 0; mov.u32 %r2552, %r2556; mov.u32 %r2553, %r2556; mov.u32 %r2554, %r2556; mov.u32 %r2555, %r2556; @%p26 bra $L__BB0_7; ld.global.v4.u32 {%r2552, %r2553, %r2554, %r2555}, [%rd13]; $L__BB0_7: setp.ge.s32 %p27, %r16, %r22; add.s64 %rd14, %rd13, %rd77; mov.u32 %r2557, %r2556; mov.u32 %r2558, %r2556; mov.u32 %r2559, %r2556; @%p27 bra $L__BB0_9; ld.global.v4.u32 {%r2556, %r2557, %r2558, %r2559}, [%rd14]; $L__BB0_9: mov.u32 %r2770, 0; mov.u32 %r2766, %r2770; mov.u32 %r2767, %r2770; mov.u32 %r2768, %r2770; mov.u32 %r2769, %r2770; @%p24 bra $L__BB0_11; ld.global.v4.u32 {%r2766, %r2767, %r2768, %r2769}, [%rd217]; $L__BB0_11: add.s64 %rd16, %rd217, %rd77; mov.u32 %r2771, %r2770; mov.u32 %r2772, %r2770; mov.u32 %r2773, %r2770; @%p25 bra $L__BB0_13; ld.global.v4.u32 {%r2770, %r2771, %r2772, %r2773}, [%rd16]; $L__BB0_13: add.s64 %rd17, %rd16, %rd77; mov.u32 %r2778, 0; mov.u32 %r2774, %r2778; mov.u32 %r2775, %r2778; mov.u32 %r2776, %r2778; mov.u32 %r2777, %r2778; @%p26 bra $L__BB0_15; ld.global.v4.u32 {%r2774, %r2775, %r2776, %r2777}, [%rd17]; $L__BB0_15: add.s64 %rd18, %rd17, %rd77; mov.u32 %r2779, %r2778; mov.u32 %r2780, %r2778; mov.u32 %r2781, %r2778; @%p27 bra $L__BB0_17; ld.global.v4.u32 {%r2778, %r2779, %r2780, %r2781}, [%rd18]; $L__BB0_17: // begin inline asm cp.async.commit_group; // end inline asm add.s32 %r1209, %r954, 32768; add.s32 %r1210, %r10, %r1209; add.s32 %r1030, %r1210, %r17; add.s32 %r1035, %r1210, %r20; add.s32 %r1211, %r17, 4096; add.s32 %r1040, %r1210, %r1211; add.s32 %r1045, %r1210, %r21; // begin inline asm st.shared.v4.b32 [%r1030], {%r2544, %r2545, %r2546, %r2547}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1035], {%r2548, %r2549, %r2550, %r2551}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1040], {%r2552, %r2553, %r2554, %r2555}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1045], {%r2556, %r2557, %r2558, %r2559}; // end inline asm add.s32 %r1212, %r11, %r954; add.s32 %r1213, %r1212, 40960; add.s32 %r1050, %r1213, %r17; add.s32 %r1055, %r1213, %r20; add.s32 %r1060, %r1213, %r1211; add.s32 %r1065, %r1213, %r21; // begin inline asm st.shared.v4.b32 [%r1050], {%r2766, %r2767, %r2768, %r2769}; // end inline asm // 
begin inline asm st.shared.v4.b32 [%r1055], {%r2770, %r2771, %r2772, %r2773}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1060], {%r2774, %r2775, %r2776, %r2777}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1065], {%r2778, %r2779, %r2780, %r2781}; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1214, %r18, %r954; add.s32 %r1074, %r1214, %r8; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2708, %r2709, %r2710, %r2711}, [%r1074]; // end inline asm add.s32 %r1215, %r19, %r1209; add.s32 %r1079, %r1215, %r9; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2576, %r2577, %r2706, %r2707}, [%r1079]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r2921, 0; // end inline asm // begin inline asm mov.u32 %r2920, 0; // end inline asm // begin inline asm mov.u32 %r2919, 0; // end inline asm // begin inline asm mov.u32 %r2918, 0; // end inline asm // begin inline asm mov.u32 %r2917, 0; // end inline asm // begin inline asm mov.u32 %r2916, 0; // end inline asm // begin inline asm mov.u32 %r2915, 0; // end inline asm // begin inline asm mov.u32 %r2914, 0; // end inline asm // begin inline asm mov.u32 %r2913, 0; // end inline asm // begin inline asm mov.u32 %r2912, 0; // end inline asm // begin inline asm mov.u32 %r2911, 0; // end inline asm // begin inline asm mov.u32 %r2910, 0; // end inline asm // begin inline asm mov.u32 %r2909, 0; // end inline asm // begin inline asm mov.u32 %r2908, 0; // end inline asm // begin inline asm mov.u32 %r2907, 0; // end inline asm // begin inline asm mov.u32 %r2906, 0; // end inline asm // begin inline asm mov.u32 %r2905, 0; // end inline asm // begin inline asm mov.u32 %r2904, 0; // end inline asm // begin inline asm mov.u32 %r2903, 0; // end inline asm // begin inline asm mov.u32 %r2902, 0; // end inline asm // begin inline asm mov.u32 %r2901, 0; // end inline asm // begin inline asm mov.u32 %r2900, 0; // end inline asm // begin inline asm mov.u32 %r2899, 0; // end inline asm // begin inline asm mov.u32 %r2898, 0; // end inline asm // begin inline asm mov.u32 %r2897, 0; // end inline asm // begin inline asm mov.u32 %r2896, 0; // end inline asm // begin inline asm mov.u32 %r2895, 0; // end inline asm // begin inline asm mov.u32 %r2894, 0; // end inline asm // begin inline asm mov.u32 %r2893, 0; // end inline asm // begin inline asm mov.u32 %r2892, 0; // end inline asm // begin inline asm mov.u32 %r2891, 0; // end inline asm // begin inline asm mov.u32 %r2890, 0; // end inline asm // begin inline asm mov.u32 %r2889, 0; // end inline asm // begin inline asm mov.u32 %r2888, 0; // end inline asm // begin inline asm mov.u32 %r2887, 0; // end inline asm // begin inline asm mov.u32 %r2886, 0; // end inline asm // begin inline asm mov.u32 %r2885, 0; // end inline asm // begin inline asm mov.u32 %r2884, 0; // end inline asm // begin inline asm mov.u32 %r2883, 0; // end inline asm // begin inline asm mov.u32 %r2882, 0; // end inline asm // begin inline asm mov.u32 %r2881, 0; // end inline asm // begin inline asm mov.u32 %r2880, 0; // end inline asm // begin inline asm mov.u32 %r2879, 0; // end inline asm // begin inline asm mov.u32 %r2878, 0; // end inline asm // begin inline asm mov.u32 %r2877, 0; // end inline asm // begin inline asm mov.u32 %r2876, 0; // end inline asm // begin inline asm mov.u32 %r2875, 0; // end inline asm // begin inline asm mov.u32 %r2874, 0; // end inline asm // begin inline asm mov.u32 %r2873, 0; // end inline asm // 
begin inline asm mov.u32 %r2872, 0; // end inline asm // begin inline asm mov.u32 %r2871, 0; // end inline asm // begin inline asm mov.u32 %r2870, 0; // end inline asm // begin inline asm mov.u32 %r2869, 0; // end inline asm // begin inline asm mov.u32 %r2868, 0; // end inline asm // begin inline asm mov.u32 %r2867, 0; // end inline asm // begin inline asm mov.u32 %r2866, 0; // end inline asm // begin inline asm mov.u32 %r2865, 0; // end inline asm // begin inline asm mov.u32 %r2864, 0; // end inline asm // begin inline asm mov.u32 %r2863, 0; // end inline asm // begin inline asm mov.u32 %r2862, 0; // end inline asm // begin inline asm mov.u32 %r2861, 0; // end inline asm // begin inline asm mov.u32 %r2860, 0; // end inline asm // begin inline asm mov.u32 %r2859, 0; // end inline asm // begin inline asm mov.u32 %r2858, 0; // end inline asm // begin inline asm mov.u32 %r2857, 0; // end inline asm // begin inline asm mov.u32 %r2856, 0; // end inline asm // begin inline asm mov.u32 %r2855, 0; // end inline asm // begin inline asm mov.u32 %r2854, 0; // end inline asm // begin inline asm mov.u32 %r2853, 0; // end inline asm // begin inline asm mov.u32 %r2852, 0; // end inline asm // begin inline asm mov.u32 %r2851, 0; // end inline asm // begin inline asm mov.u32 %r2850, 0; // end inline asm // begin inline asm mov.u32 %r2849, 0; // end inline asm // begin inline asm mov.u32 %r2848, 0; // end inline asm // begin inline asm mov.u32 %r2847, 0; // end inline asm // begin inline asm mov.u32 %r2846, 0; // end inline asm // begin inline asm mov.u32 %r2845, 0; // end inline asm // begin inline asm mov.u32 %r2844, 0; // end inline asm // begin inline asm mov.u32 %r2843, 0; // end inline asm // begin inline asm mov.u32 %r2842, 0; // end inline asm // begin inline asm mov.u32 %r2841, 0; // end inline asm // begin inline asm mov.u32 %r2840, 0; // end inline asm // begin inline asm mov.u32 %r2839, 0; // end inline asm // begin inline asm mov.u32 %r2838, 0; // end inline asm // begin inline asm mov.u32 %r2837, 0; // end inline asm // begin inline asm mov.u32 %r2836, 0; // end inline asm // begin inline asm mov.u32 %r2835, 0; // end inline asm // begin inline asm mov.u32 %r2834, 0; // end inline asm // begin inline asm mov.u32 %r2833, 0; // end inline asm // begin inline asm mov.u32 %r2832, 0; // end inline asm // begin inline asm mov.u32 %r2831, 0; // end inline asm // begin inline asm mov.u32 %r2830, 0; // end inline asm // begin inline asm mov.u32 %r2829, 0; // end inline asm // begin inline asm mov.u32 %r2828, 0; // end inline asm // begin inline asm mov.u32 %r2827, 0; // end inline asm // begin inline asm mov.u32 %r2826, 0; // end inline asm // begin inline asm mov.u32 %r2825, 0; // end inline asm // begin inline asm mov.u32 %r2824, 0; // end inline asm // begin inline asm mov.u32 %r2823, 0; // end inline asm // begin inline asm mov.u32 %r2822, 0; // end inline asm // begin inline asm mov.u32 %r2821, 0; // end inline asm // begin inline asm mov.u32 %r2820, 0; // end inline asm // begin inline asm mov.u32 %r2819, 0; // end inline asm // begin inline asm mov.u32 %r2818, 0; // end inline asm // begin inline asm mov.u32 %r2817, 0; // end inline asm // begin inline asm mov.u32 %r2816, 0; // end inline asm // begin inline asm mov.u32 %r2815, 0; // end inline asm // begin inline asm mov.u32 %r2814, 0; // end inline asm // begin inline asm mov.u32 %r2813, 0; // end inline asm // begin inline asm mov.u32 %r2812, 0; // end inline asm // begin inline asm mov.u32 %r2811, 0; // end inline asm // begin inline asm 
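//
// ---- output-accumulator initialization (annotation) ----
// This run of one-instruction inline-asm statements (continuing below)
// zeroes the 128 b32 registers %r2794..%r2921 that carry the fp32
// output-tile accumulator fragments across loop iterations; they are
// reinterpreted as f32 via mov.b32 each time the P*V update at the bottom
// of the loop folds into them.
//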
mov.u32 %r2810, 0; // end inline asm // begin inline asm mov.u32 %r2809, 0; // end inline asm // begin inline asm mov.u32 %r2808, 0; // end inline asm // begin inline asm mov.u32 %r2807, 0; // end inline asm // begin inline asm mov.u32 %r2806, 0; // end inline asm // begin inline asm mov.u32 %r2805, 0; // end inline asm // begin inline asm mov.u32 %r2804, 0; // end inline asm // begin inline asm mov.u32 %r2803, 0; // end inline asm // begin inline asm mov.u32 %r2802, 0; // end inline asm // begin inline asm mov.u32 %r2801, 0; // end inline asm // begin inline asm mov.u32 %r2800, 0; // end inline asm // begin inline asm mov.u32 %r2799, 0; // end inline asm // begin inline asm mov.u32 %r2798, 0; // end inline asm // begin inline asm mov.u32 %r2797, 0; // end inline asm // begin inline asm mov.u32 %r2796, 0; // end inline asm // begin inline asm mov.u32 %r2795, 0; // end inline asm // begin inline asm mov.u32 %r2794, 0; // end inline asm add.s32 %r1216, %r1, 15; shr.s32 %r1217, %r1216, 31; shr.u32 %r1218, %r1217, 28; add.s32 %r1219, %r1216, %r1218; and.b32 %r233, %r1219, -16; setp.lt.s32 %p32, %r1, 1; @%p32 bra $L__BB0_65; xor.b32 %r2733, %r19, 32; xor.b32 %r2732, %r18, 32; ld.param.u8 %rs1, [%rd1+160]; add.s32 %r244, %r8, %r954; add.s32 %r1222, %r9, %r954; add.s32 %r245, %r1222, 32768; cvt.s64.s32 %rd19, %r7; cvt.s64.s32 %rd20, %r2731; add.s32 %r1223, %r2731, 1; cvt.s64.s32 %rd21, %r1223; add.s32 %r1224, %r2731, 8; cvt.s64.s32 %rd22, %r1224; add.s32 %r1225, %r2731, 9; cvt.s64.s32 %rd23, %r1225; add.s32 %r246, %r7, 8; mov.u32 %r2730, 0; mov.f32 %f1286, 0fFF800000; mov.f32 %f1284, 0f00000000; mov.f32 %f1285, %f1284; mov.f32 %f1287, %f1286; mov.u32 %r2782, %r1; mov.u32 %r2783, %r1; $L__BB0_19: setp.eq.s16 %p33, %rs1, 0; @%p33 bra $L__BB0_36; cvt.s64.s32 %rd78, %r3; add.s64 %rd79, %rd19, %rd78; cvt.s64.s32 %rd80, %r1; setp.ge.u64 %p34, %rd79, %rd80; mul.lo.s32 %r1228, %r1, %r3; cvt.s64.s32 %rd81, %r1228; cvt.u64.u32 %rd26, %r2730; add.s64 %rd82, %rd81, %rd26; add.s64 %rd27, %rd82, %rd20; mul.lo.s64 %rd83, %rd80, %rd19; add.s64 %rd84, %rd27, %rd83; add.s64 %rd28, %rd20, %rd26; setp.ge.u64 %p35, %rd28, %rd80; shl.b64 %rd85, %rd84, 1; mad.lo.s32 %r1231, %r2, %r879, %r880; cvt.s64.s32 %rd86, %r1231; mul.lo.s64 %rd87, %rd3, %rd86; add.s64 %rd88, %rd87, %rd85; cvta.to.global.u64 %rd89, %rd2; add.s64 %rd29, %rd89, %rd88; mov.u16 %rs68, 0; or.pred %p36, %p35, %p34; mov.u16 %rs67, %rs68; @%p36 bra $L__BB0_22; ld.global.u16 %rs67, [%rd29]; $L__BB0_22: add.s64 %rd30, %rd21, %rd26; setp.ge.u64 %p38, %rd30, %rd80; or.pred %p39, %p38, %p34; @%p39 bra $L__BB0_24; ld.global.u16 %rs68, [%rd29+2]; $L__BB0_24: add.s64 %rd31, %rd22, %rd26; setp.ge.u64 %p41, %rd31, %rd80; mov.u16 %rs70, 0; or.pred %p42, %p41, %p34; mov.u16 %rs69, %rs70; @%p42 bra $L__BB0_26; ld.global.u16 %rs69, [%rd29+16]; $L__BB0_26: add.s64 %rd32, %rd23, %rd26; setp.ge.u64 %p44, %rd32, %rd80; or.pred %p45, %p44, %p34; @%p45 bra $L__BB0_28; ld.global.u16 %rs70, [%rd29+18]; $L__BB0_28: cvt.s64.s32 %rd101, %r246; add.s64 %rd102, %rd101, %rd78; setp.ge.u64 %p47, %rd102, %rd80; mul.wide.s32 %rd103, %r1, %r246; add.s64 %rd104, %rd27, %rd103; shl.b64 %rd105, %rd104, 1; add.s64 %rd108, %rd87, %rd105; add.s64 %rd33, %rd89, %rd108; mov.u16 %rs72, 0; or.pred %p48, %p35, %p47; mov.u16 %rs71, %rs72; @%p48 bra $L__BB0_30; ld.global.u16 %rs71, [%rd33]; $L__BB0_30: or.pred %p51, %p38, %p47; @%p51 bra $L__BB0_32; ld.global.u16 %rs72, [%rd33+2]; $L__BB0_32: mov.u16 %rs74, 0; or.pred %p54, %p41, %p47; mov.u16 %rs73, %rs74; @%p54 bra $L__BB0_34; ld.global.u16 %rs73, 
[%rd33+16]; $L__BB0_34: or.pred %p57, %p44, %p47; @%p57 bra $L__BB0_36; ld.global.u16 %rs74, [%rd33+18]; $L__BB0_36: // begin inline asm mov.u32 %r1249, 0; // end inline asm // begin inline asm mov.u32 %r1250, 0; // end inline asm // begin inline asm mov.u32 %r1251, 0; // end inline asm // begin inline asm mov.u32 %r1252, 0; // end inline asm // begin inline asm mov.u32 %r1253, 0; // end inline asm // begin inline asm mov.u32 %r1254, 0; // end inline asm // begin inline asm mov.u32 %r1255, 0; // end inline asm // begin inline asm mov.u32 %r1256, 0; // end inline asm add.s32 %r1261, %r244, %r2732; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1257, %r1258, %r1259, %r1260}, [%r1261]; // end inline asm add.s32 %r1266, %r245, %r2733; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1262, %r1263, %r1264, %r1265}, [%r1266]; // end inline asm mov.b32 %f198, %r1249; mov.b32 %f199, %r1250; mov.b32 %f200, %r1251; mov.b32 %f201, %r1252; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r2708, %r2709, %r2710, %r2711}, {%r2576, %r2577}, {%f198, %f199, %f200, %f201}; // end inline asm mov.b32 %f206, %r1253; mov.b32 %f207, %r1254; mov.b32 %f208, %r1255; mov.b32 %f209, %r1256; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r2708, %r2709, %r2710, %r2711}, {%r2706, %r2707}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1599, %r2732, 96; add.s32 %r1283, %r244, %r1599; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1279, %r1280, %r1281, %r1282}, [%r1283]; // end inline asm xor.b32 %r1600, %r2733, 96; add.s32 %r1288, %r245, %r1600; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1284, %r1285, %r1286, %r1287}, [%r1288]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1257, %r1258, %r1259, %r1260}, {%r1262, %r1263}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1257, %r1258, %r1259, %r1260}, {%r1264, %r1265}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1601, %r2732, 64; add.s32 %r1305, %r244, %r1601; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1301, %r1302, %r1303, %r1304}, [%r1305]; // end inline asm xor.b32 %r1602, %r2733, 64; add.s32 %r1310, %r245, %r1602; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1306, %r1307, %r1308, %r1309}, [%r1310]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1279, %r1280, %r1281, %r1282}, {%r1284, %r1285}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1279, %r1280, %r1281, %r1282}, {%r1286, %r1287}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1603, %r2732, 160; add.s32 %r1327, %r244, %r1603; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1323, %r1324, %r1325, %r1326}, [%r1327]; // end inline asm xor.b32 %r1604, %r2733, 160; add.s32 %r1332, %r245, %r1604; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1328, %r1329, %r1330, %r1331}, [%r1332]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1301, %r1302, %r1303, %r1304}, {%r1306, %r1307}, {%f198, %f199, %f200, %f201}; // end inline 
asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1301, %r1302, %r1303, %r1304}, {%r1308, %r1309}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1605, %r2732, 128; add.s32 %r1349, %r244, %r1605; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1345, %r1346, %r1347, %r1348}, [%r1349]; // end inline asm xor.b32 %r1606, %r2733, 128; add.s32 %r1354, %r245, %r1606; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1350, %r1351, %r1352, %r1353}, [%r1354]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1323, %r1324, %r1325, %r1326}, {%r1328, %r1329}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1323, %r1324, %r1325, %r1326}, {%r1330, %r1331}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1607, %r2732, 224; add.s32 %r1371, %r244, %r1607; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1367, %r1368, %r1369, %r1370}, [%r1371]; // end inline asm xor.b32 %r1608, %r2733, 224; add.s32 %r1376, %r245, %r1608; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1372, %r1373, %r1374, %r1375}, [%r1376]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1345, %r1346, %r1347, %r1348}, {%r1350, %r1351}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1345, %r1346, %r1347, %r1348}, {%r1352, %r1353}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1609, %r2732, 192; add.s32 %r1393, %r244, %r1609; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1389, %r1390, %r1391, %r1392}, [%r1393]; // end inline asm xor.b32 %r1610, %r2733, 192; add.s32 %r1398, %r245, %r1610; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1394, %r1395, %r1396, %r1397}, [%r1398]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1367, %r1368, %r1369, %r1370}, {%r1372, %r1373}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1367, %r1368, %r1369, %r1370}, {%r1374, %r1375}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1611, %r2732, 288; add.s32 %r1415, %r244, %r1611; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1411, %r1412, %r1413, %r1414}, [%r1415]; // end inline asm xor.b32 %r1612, %r2733, 288; add.s32 %r1420, %r245, %r1612; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1416, %r1417, %r1418, %r1419}, [%r1420]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1389, %r1390, %r1391, %r1392}, {%r1394, %r1395}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1389, %r1390, %r1391, %r1392}, {%r1396, %r1397}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1613, %r2732, 256; add.s32 %r1437, %r244, %r1613; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1433, %r1434, %r1435, %r1436}, [%r1437]; // end inline asm xor.b32 %r1614, %r2733, 256; add.s32 %r1442, %r245, %r1614; // begin inline asm 
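//
// ---- S = Q*K^T tensor-core chain (annotation, continues below) ----
// Each step pairs two ldmatrix.sync.aligned.m8n8.x4.shared.b16 loads, a Q
// fragment from %r244 (smem base) and a K fragment from %r245 (smem base
// +32768), with two mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 ops
// accumulating into {%f198..%f201} and {%f206..%f209}, the two n=8 halves
// of a 16x16 logit block. The XOR masks (96, 64, 160, 128, ..., 480, 448)
// walk the 256-element head dimension through the swizzled layout, sixteen
// k-slices in all (256 = 16 x 16). CUDA-style pseudocode, an illustrative
// sketch only:
//
//   for (int k = 0; k < 256; k += 16) {        // head dim, one slice each
//     a = ldmatrix_x4(smem_q + swizzle(k));    // 16x16 fp16 A fragment
//     b = ldmatrix_x4(smem_k + swizzle(k));    // 16x16 fp16 B fragment
//     mma_m16n8k16(acc_lo, a, b.lo);           // logit columns 0..7
//     mma_m16n8k16(acc_hi, a, b.hi);           // logit columns 8..15
//   }
//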
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1438, %r1439, %r1440, %r1441}, [%r1442]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1411, %r1412, %r1413, %r1414}, {%r1416, %r1417}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1411, %r1412, %r1413, %r1414}, {%r1418, %r1419}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1615, %r2732, 352; add.s32 %r1459, %r244, %r1615; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1455, %r1456, %r1457, %r1458}, [%r1459]; // end inline asm xor.b32 %r1616, %r2733, 352; add.s32 %r1464, %r245, %r1616; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1460, %r1461, %r1462, %r1463}, [%r1464]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1433, %r1434, %r1435, %r1436}, {%r1438, %r1439}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1433, %r1434, %r1435, %r1436}, {%r1440, %r1441}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1617, %r2732, 320; add.s32 %r1481, %r244, %r1617; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1477, %r1478, %r1479, %r1480}, [%r1481]; // end inline asm xor.b32 %r1618, %r2733, 320; add.s32 %r1486, %r245, %r1618; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1482, %r1483, %r1484, %r1485}, [%r1486]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1455, %r1456, %r1457, %r1458}, {%r1460, %r1461}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1455, %r1456, %r1457, %r1458}, {%r1462, %r1463}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1619, %r2732, 416; add.s32 %r1503, %r244, %r1619; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1499, %r1500, %r1501, %r1502}, [%r1503]; // end inline asm xor.b32 %r1620, %r2733, 416; add.s32 %r1508, %r245, %r1620; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1504, %r1505, %r1506, %r1507}, [%r1508]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1477, %r1478, %r1479, %r1480}, {%r1482, %r1483}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1477, %r1478, %r1479, %r1480}, {%r1484, %r1485}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1621, %r2732, 384; add.s32 %r1525, %r244, %r1621; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1521, %r1522, %r1523, %r1524}, [%r1525]; // end inline asm xor.b32 %r1622, %r2733, 384; add.s32 %r1530, %r245, %r1622; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1526, %r1527, %r1528, %r1529}, [%r1530]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1499, %r1500, %r1501, %r1502}, {%r1504, %r1505}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1499, %r1500, %r1501, %r1502}, {%r1506, %r1507}, {%f206, %f207, %f208, %f209}; // 
end inline asm xor.b32 %r1623, %r2732, 480; add.s32 %r1547, %r244, %r1623; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2708, %r2709, %r2710, %r2711}, [%r1547]; // end inline asm xor.b32 %r1624, %r2733, 480; add.s32 %r1552, %r245, %r1624; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2576, %r2577, %r2706, %r2707}, [%r1552]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1521, %r1522, %r1523, %r1524}, {%r1526, %r1527}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1521, %r1522, %r1523, %r1524}, {%r1528, %r1529}, {%f206, %f207, %f208, %f209}; // end inline asm xor.b32 %r1625, %r2732, 448; add.s32 %r1569, %r244, %r1625; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1565, %r1566, %r1567, %r1568}, [%r1569]; // end inline asm xor.b32 %r1626, %r2733, 448; add.s32 %r1574, %r245, %r1626; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1570, %r1571, %r1572, %r1573}, [%r1574]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r2708, %r2709, %r2710, %r2711}, {%r2576, %r2577}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r2708, %r2709, %r2710, %r2711}, {%r2706, %r2707}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r1565, %r1566, %r1567, %r1568}, {%r1570, %r1571}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r1565, %r1566, %r1567, %r1568}, {%r1572, %r1573}, {%f206, %f207, %f208, %f209}; // end inline asm mul.ftz.f32 %f438, %f1, %f198; mul.ftz.f32 %f439, %f1, %f199; mul.ftz.f32 %f440, %f1, %f206; mul.ftz.f32 %f441, %f1, %f207; mul.ftz.f32 %f442, %f1, %f200; mul.ftz.f32 %f443, %f1, %f201; mul.ftz.f32 %f444, %f1, %f208; mul.ftz.f32 %f445, %f1, %f209; setp.lt.s32 %p58, %r2731, %r1; selp.f32 %f1293, %f438, 0fFF800000, %p58; add.s32 %r1627, %r2731, 1; setp.lt.s32 %p59, %r1627, %r1; selp.f32 %f1292, %f439, 0fFF800000, %p59; add.s32 %r1628, %r2731, 8; setp.lt.s32 %p60, %r1628, %r1; selp.f32 %f1291, %f440, 0fFF800000, %p60; add.s32 %r1629, %r2731, 9; setp.lt.s32 %p61, %r1629, %r1; selp.f32 %f1290, %f441, 0fFF800000, %p61; selp.f32 %f1289, %f442, 0fFF800000, %p58; selp.f32 %f1288, %f443, 0fFF800000, %p59; selp.f32 %f1294, %f444, 0fFF800000, %p60; selp.f32 %f1295, %f445, 0fFF800000, %p61; @%p33 bra $L__BB0_38; // begin inline asm cvt.f32.f16 %f446, %rs67; // end inline asm add.ftz.f32 %f1293, %f446, %f1293; // begin inline asm cvt.f32.f16 %f447, %rs68; // end inline asm add.ftz.f32 %f1292, %f447, %f1292; // begin inline asm cvt.f32.f16 %f448, %rs69; // end inline asm add.ftz.f32 %f1291, %f448, %f1291; // begin inline asm cvt.f32.f16 %f449, %rs70; // end inline asm add.ftz.f32 %f1290, %f449, %f1290; // begin inline asm cvt.f32.f16 %f450, %rs71; // end inline asm add.ftz.f32 %f1289, %f450, %f1289; // begin inline asm cvt.f32.f16 %f451, %rs72; // end inline asm add.ftz.f32 %f1288, %f451, %f1288; // begin inline asm cvt.f32.f16 %f452, %rs73; // end inline asm add.ftz.f32 %f1294, %f452, %f1294; // begin inline asm cvt.f32.f16 %f453, %rs74; // end inline asm add.ftz.f32 %f1295, %f453, %f1295; 
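//
// ---- scale, bound and bias the logits (annotation) ----
// The eight dot products are scaled by %f1 (the softmax scale read from
// param+64); columns at or past the sequence length %r1 are forced to
// -inf (0fFF800000) with selp.f32 so they vanish after exponentiation;
// and when the flag byte at param+160 (%rs1) is set, the fp16 mask values
// %rs67..%rs74 are widened with cvt.f32.f16 and added. Equivalent scalar
// form (illustrative):
//
//   s = scale * qk;
//   s = (col < seq_len) ? s : -INFINITY;   // 0fFF800000
//   if (has_mask) s += (float)mask_val;
//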
$L__BB0_38: setp.gt.ftz.f32 %p63, %f1293, %f1292; selp.f32 %f454, %f1293, %f1292, %p63; setp.gt.ftz.f32 %p64, %f454, %f1291; selp.f32 %f455, %f454, %f1291, %p64; setp.gt.ftz.f32 %p65, %f455, %f1290; selp.f32 %f456, %f455, %f1290, %p65; setp.gt.ftz.f32 %p66, %f1289, %f1288; selp.f32 %f457, %f1289, %f1288, %p66; setp.gt.ftz.f32 %p67, %f457, %f1294; selp.f32 %f458, %f457, %f1294, %p67; setp.gt.ftz.f32 %p68, %f458, %f1295; selp.f32 %f459, %f458, %f1295, %p68; mov.b32 %r1630, %f456; mov.u32 %r1631, 31; mov.u32 %r1632, 1; mov.u32 %r1633, -1; shfl.sync.bfly.b32 %r1634|%p69, %r1630, %r1632, %r1631, %r1633; mov.b32 %f460, %r1634; setp.gt.ftz.f32 %p70, %f456, %f460; selp.f32 %f461, %f456, %f460, %p70; mov.b32 %r1635, %f461; mov.u32 %r1636, 2; shfl.sync.bfly.b32 %r1637|%p71, %r1635, %r1636, %r1631, %r1633; mov.b32 %f462, %r1637; setp.gt.ftz.f32 %p72, %f461, %f462; selp.f32 %f463, %f461, %f462, %p72; mov.b32 %r1638, %f459; shfl.sync.bfly.b32 %r1639|%p73, %r1638, %r1632, %r1631, %r1633; mov.b32 %f464, %r1639; setp.gt.ftz.f32 %p74, %f459, %f464; selp.f32 %f465, %f459, %f464, %p74; mov.b32 %r1640, %f465; shfl.sync.bfly.b32 %r1641|%p75, %r1640, %r1636, %r1631, %r1633; mov.b32 %f466, %r1641; setp.gt.ftz.f32 %p76, %f465, %f466; selp.f32 %f467, %f465, %f466, %p76; max.ftz.f32 %f30, %f463, %f1287; max.ftz.f32 %f31, %f467, %f1286; sub.ftz.f32 %f468, %f1293, %f30; mul.ftz.f32 %f469, %f468, 0f3FB8AA3B; ex2.approx.ftz.f32 %f32, %f469; sub.ftz.f32 %f470, %f1292, %f30; mul.ftz.f32 %f471, %f470, 0f3FB8AA3B; ex2.approx.ftz.f32 %f33, %f471; sub.ftz.f32 %f472, %f1291, %f30; mul.ftz.f32 %f473, %f472, 0f3FB8AA3B; ex2.approx.ftz.f32 %f34, %f473; sub.ftz.f32 %f474, %f1290, %f30; mul.ftz.f32 %f475, %f474, 0f3FB8AA3B; ex2.approx.ftz.f32 %f35, %f475; sub.ftz.f32 %f476, %f1289, %f31; mul.ftz.f32 %f477, %f476, 0f3FB8AA3B; ex2.approx.ftz.f32 %f36, %f477; sub.ftz.f32 %f478, %f1288, %f31; mul.ftz.f32 %f479, %f478, 0f3FB8AA3B; ex2.approx.ftz.f32 %f37, %f479; sub.ftz.f32 %f480, %f1294, %f31; mul.ftz.f32 %f481, %f480, 0f3FB8AA3B; ex2.approx.ftz.f32 %f38, %f481; sub.ftz.f32 %f482, %f1295, %f31; mul.ftz.f32 %f483, %f482, 0f3FB8AA3B; ex2.approx.ftz.f32 %f39, %f483; add.ftz.f32 %f484, %f32, %f33; add.ftz.f32 %f485, %f484, 0f00000000; add.ftz.f32 %f486, %f34, %f35; add.ftz.f32 %f487, %f486, 0f00000000; add.ftz.f32 %f488, %f485, %f487; add.ftz.f32 %f489, %f36, %f37; add.ftz.f32 %f490, %f489, 0f00000000; add.ftz.f32 %f491, %f38, %f39; add.ftz.f32 %f492, %f491, 0f00000000; add.ftz.f32 %f493, %f490, %f492; mov.b32 %r1642, %f488; shfl.sync.bfly.b32 %r1643|%p77, %r1642, %r1632, %r1631, %r1633; mov.b32 %f494, %r1643; add.ftz.f32 %f495, %f488, %f494; mov.b32 %r1644, %f495; shfl.sync.bfly.b32 %r1645|%p78, %r1644, %r1636, %r1631, %r1633; mov.b32 %f496, %r1645; add.ftz.f32 %f40, %f495, %f496; mov.b32 %r1646, %f493; shfl.sync.bfly.b32 %r1647|%p79, %r1646, %r1632, %r1631, %r1633; mov.b32 %f497, %r1647; add.ftz.f32 %f498, %f493, %f497; mov.b32 %r1648, %f498; shfl.sync.bfly.b32 %r1649|%p80, %r1648, %r1636, %r1631, %r1633; mov.b32 %f499, %r1649; add.ftz.f32 %f41, %f498, %f499; add.s32 %r415, %r2730, 16; setp.ge.s32 %p81, %r415, %r233; @%p81 bra $L__BB0_56; shl.b64 %rd122, %rd5, 4; add.s64 %rd34, %rd218, %rd122; add.s32 %r2782, %r2782, -16; add.s32 %r2783, %r2783, -16; min.s32 %r418, %r2783, 16; setp.ge.s32 %p82, %r6, %r418; mul.lo.s64 %rd123, %rd5, 20; add.s64 %rd35, %rd218, %rd123; mul.lo.s64 %rd124, %rd5, 24; add.s64 %rd36, %rd218, %rd124; mul.lo.s64 %rd125, %rd5, 28; add.s64 %rd37, %rd218, %rd125; mov.u32 %r2738, 0; mov.u32 %r2734, %r2738; mov.u32 
%r2735, %r2738; mov.u32 %r2736, %r2738; mov.u32 %r2737, %r2738; @%p82 bra $L__BB0_41; ld.global.v4.u32 {%r2734, %r2735, %r2736, %r2737}, [%rd34]; $L__BB0_41: setp.ge.s32 %p83, %r14, %r418; mov.u32 %r2739, %r2738; mov.u32 %r2740, %r2738; mov.u32 %r2741, %r2738; @%p83 bra $L__BB0_43; ld.global.v4.u32 {%r2738, %r2739, %r2740, %r2741}, [%rd35]; $L__BB0_43: setp.ge.s32 %p84, %r15, %r418; mov.u32 %r2746, 0; mov.u32 %r2742, %r2746; mov.u32 %r2743, %r2746; mov.u32 %r2744, %r2746; mov.u32 %r2745, %r2746; @%p84 bra $L__BB0_45; ld.global.v4.u32 {%r2742, %r2743, %r2744, %r2745}, [%rd36]; $L__BB0_45: setp.ge.s32 %p85, %r16, %r418; mov.u32 %r2747, %r2746; mov.u32 %r2748, %r2746; mov.u32 %r2749, %r2746; @%p85 bra $L__BB0_47; ld.global.v4.u32 {%r2746, %r2747, %r2748, %r2749}, [%rd37]; $L__BB0_47: add.s64 %rd38, %rd217, %rd122; min.s32 %r451, %r2782, 16; setp.ge.s32 %p86, %r6, %r451; add.s64 %rd39, %rd217, %rd123; add.s64 %rd40, %rd217, %rd124; add.s64 %rd41, %rd217, %rd125; mov.u32 %r2770, 0; mov.u32 %r2766, %r2770; mov.u32 %r2767, %r2770; mov.u32 %r2768, %r2770; mov.u32 %r2769, %r2770; @%p86 bra $L__BB0_49; ld.global.v4.u32 {%r2766, %r2767, %r2768, %r2769}, [%rd38]; $L__BB0_49: setp.ge.s32 %p87, %r14, %r451; mov.u32 %r2771, %r2770; mov.u32 %r2772, %r2770; mov.u32 %r2773, %r2770; @%p87 bra $L__BB0_51; ld.global.v4.u32 {%r2770, %r2771, %r2772, %r2773}, [%rd39]; $L__BB0_51: setp.ge.s32 %p88, %r15, %r451; mov.u32 %r2778, 0; mov.u32 %r2774, %r2778; mov.u32 %r2775, %r2778; mov.u32 %r2776, %r2778; mov.u32 %r2777, %r2778; @%p88 bra $L__BB0_53; ld.global.v4.u32 {%r2774, %r2775, %r2776, %r2777}, [%rd40]; $L__BB0_53: setp.ge.s32 %p89, %r16, %r451; mov.u32 %r2779, %r2778; mov.u32 %r2780, %r2778; mov.u32 %r2781, %r2778; @%p89 bra $L__BB0_55; ld.global.v4.u32 {%r2778, %r2779, %r2780, %r2781}, [%rd41]; $L__BB0_55: // begin inline asm cp.async.commit_group; // end inline asm bar.sync 0; // begin inline asm st.shared.v4.b32 [%r1030], {%r2734, %r2735, %r2736, %r2737}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1035], {%r2738, %r2739, %r2740, %r2741}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1040], {%r2742, %r2743, %r2744, %r2745}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1045], {%r2746, %r2747, %r2748, %r2749}; // end inline asm mov.u64 %rd217, %rd38; mov.u64 %rd218, %rd34; $L__BB0_56: xor.b32 %r2792, %r2733, 32; xor.b32 %r2793, %r2732, 32; sub.ftz.f32 %f500, %f1287, %f30; mul.ftz.f32 %f501, %f500, 0f3FB8AA3B; ex2.approx.ftz.f32 %f502, %f501; mul.ftz.f32 %f42, %f502, %f1285; add.ftz.f32 %f1285, %f42, %f40; sub.ftz.f32 %f503, %f1286, %f31; mul.ftz.f32 %f504, %f503, 0f3FB8AA3B; ex2.approx.ftz.f32 %f505, %f504; mul.ftz.f32 %f44, %f505, %f1284; add.ftz.f32 %f1284, %f44, %f41; @%p81 bra $L__BB0_58; xor.b32 %r2543, %r2733, 32; xor.b32 %r2542, %r2732, 32; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1744, %r244, %r2542; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2708, %r2709, %r2710, %r2711}, [%r1744]; // end inline asm add.s32 %r1749, %r245, %r2543; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2576, %r2577, %r2706, %r2707}, [%r1749]; // end inline asm mov.u32 %r2792, %r2733; mov.u32 %r2793, %r2732; $L__BB0_58: add.s32 %r2539, %r2730, 16; setp.lt.s32 %p91, %r2539, %r233; @%p91 bra $L__BB0_60; bar.sync 0; $L__BB0_60: shl.b32 %r2159, %r878, 9; and.b32 %r2160, %r2159, 7680; or.b32 %r2161, %r925, %r2160; add.s32 %r2163, %r954, 40960; add.s32 %r1754, %r2161, %r2163; // begin inline asm 
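//
// ---- online-softmax statistics and next-tile prefetch (annotation) ----
// $L__BB0_38 reduces row maxima and row sums across each group of four
// lanes with shfl.sync.bfly.b32 (xor 1, then 2), exponentiating via
// ex2.approx.ftz after a multiply by 0f3FB8AA3B = log2(e) ~ 1.4426951,
// i.e. exp(x) = 2^(x * log2(e)). While that math runs, the next K/V tiles
// are prefetched into the double buffer (predicated v4.u32 loads,
// cp.async.commit_group, st.shared); cp.async.wait_group 0 and bar.sync 0
// fence the swap, and the Q/K fragments for the next iteration are
// reloaded around $L__BB0_56..58. The running sums %f1285/%f1284 get the
// standard online-softmax correction. Illustrative scalar form:
//
//   m_new = max(m_old, rowmax(s));              // shfl.bfly max tree
//   p     = exp2f((s - m_new) * 1.4426951f);    // ex2.approx.ftz
//   corr  = exp2f((m_old - m_new) * 1.4426951f);
//   l_new = corr * l_old + rowsum(p);           // %f42/%f44 = corr * l_old
//
// The ldmatrix...trans loads below pull the V tile (smem +40960) as
// column-major fragments for the P*V product.
//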
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1750, %r1751, %r1752, %r1753}, [%r1754]; // end inline asm xor.b32 %r2164, %r2161, 32; add.s32 %r1759, %r2164, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1755, %r1756, %r1757, %r1758}, [%r1759]; // end inline asm xor.b32 %r2165, %r2161, 64; add.s32 %r1764, %r2165, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1760, %r1761, %r1762, %r1763}, [%r1764]; // end inline asm xor.b32 %r2166, %r2161, 96; add.s32 %r1769, %r2166, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1765, %r1766, %r1767, %r1768}, [%r1769]; // end inline asm or.b32 %r2167, %r2161, 128; add.s32 %r1774, %r2167, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1770, %r1771, %r1772, %r1773}, [%r1774]; // end inline asm xor.b32 %r2168, %r2161, 160; add.s32 %r1779, %r2168, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1775, %r1776, %r1777, %r1778}, [%r1779]; // end inline asm xor.b32 %r2169, %r2161, 192; add.s32 %r1784, %r2169, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1780, %r1781, %r1782, %r1783}, [%r1784]; // end inline asm xor.b32 %r2170, %r2161, 224; add.s32 %r1789, %r2170, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1785, %r1786, %r1787, %r1788}, [%r1789]; // end inline asm or.b32 %r2171, %r2161, 256; add.s32 %r1794, %r2171, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1790, %r1791, %r1792, %r1793}, [%r1794]; // end inline asm xor.b32 %r2172, %r2161, 288; add.s32 %r1799, %r2172, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1795, %r1796, %r1797, %r1798}, [%r1799]; // end inline asm xor.b32 %r2173, %r2161, 320; add.s32 %r1804, %r2173, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1800, %r1801, %r1802, %r1803}, [%r1804]; // end inline asm xor.b32 %r2174, %r2161, 352; add.s32 %r1809, %r2174, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1805, %r1806, %r1807, %r1808}, [%r1809]; // end inline asm or.b32 %r2175, %r2161, 384; add.s32 %r1814, %r2175, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1810, %r1811, %r1812, %r1813}, [%r1814]; // end inline asm xor.b32 %r2176, %r2161, 416; add.s32 %r1819, %r2176, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1815, %r1816, %r1817, %r1818}, [%r1819]; // end inline asm xor.b32 %r2177, %r2161, 448; add.s32 %r1824, %r2177, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1820, %r1821, %r1822, %r1823}, [%r1824]; // end inline asm xor.b32 %r2178, %r2161, 480; add.s32 %r1829, %r2178, %r2163; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1825, %r1826, %r1827, %r1828}, [%r1829]; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1830, %f33, %f32; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1831, %f37, %f36; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1832, %f35, %f34; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1833, %f39, %f38; // end inline asm // begin inline asm mov.u32 %r1834, 0; // end inline asm // begin inline asm mov.u32 %r1835, 0; // end inline asm // begin inline asm mov.u32 %r1836, 0; // end inline asm // begin inline asm mov.u32 %r1837, 0; // end inline asm // begin inline asm mov.u32 %r1838, 0; // end inline asm // begin inline 
asm mov.u32 %r1839, 0; // end inline asm // begin inline asm mov.u32 %r1840, 0; // end inline asm // begin inline asm mov.u32 %r1841, 0; // end inline asm // begin inline asm mov.u32 %r1842, 0; // end inline asm // begin inline asm mov.u32 %r1843, 0; // end inline asm // begin inline asm mov.u32 %r1844, 0; // end inline asm // begin inline asm mov.u32 %r1845, 0; // end inline asm // begin inline asm mov.u32 %r1846, 0; // end inline asm // begin inline asm mov.u32 %r1847, 0; // end inline asm // begin inline asm mov.u32 %r1848, 0; // end inline asm // begin inline asm mov.u32 %r1849, 0; // end inline asm // begin inline asm mov.u32 %r1850, 0; // end inline asm // begin inline asm mov.u32 %r1851, 0; // end inline asm // begin inline asm mov.u32 %r1852, 0; // end inline asm // begin inline asm mov.u32 %r1853, 0; // end inline asm // begin inline asm mov.u32 %r1854, 0; // end inline asm // begin inline asm mov.u32 %r1855, 0; // end inline asm // begin inline asm mov.u32 %r1856, 0; // end inline asm // begin inline asm mov.u32 %r1857, 0; // end inline asm // begin inline asm mov.u32 %r1858, 0; // end inline asm // begin inline asm mov.u32 %r1859, 0; // end inline asm // begin inline asm mov.u32 %r1860, 0; // end inline asm // begin inline asm mov.u32 %r1861, 0; // end inline asm // begin inline asm mov.u32 %r1862, 0; // end inline asm // begin inline asm mov.u32 %r1863, 0; // end inline asm // begin inline asm mov.u32 %r1864, 0; // end inline asm // begin inline asm mov.u32 %r1865, 0; // end inline asm // begin inline asm mov.u32 %r1866, 0; // end inline asm // begin inline asm mov.u32 %r1867, 0; // end inline asm // begin inline asm mov.u32 %r1868, 0; // end inline asm // begin inline asm mov.u32 %r1869, 0; // end inline asm // begin inline asm mov.u32 %r1870, 0; // end inline asm // begin inline asm mov.u32 %r1871, 0; // end inline asm // begin inline asm mov.u32 %r1872, 0; // end inline asm // begin inline asm mov.u32 %r1873, 0; // end inline asm // begin inline asm mov.u32 %r1874, 0; // end inline asm // begin inline asm mov.u32 %r1875, 0; // end inline asm // begin inline asm mov.u32 %r1876, 0; // end inline asm // begin inline asm mov.u32 %r1877, 0; // end inline asm // begin inline asm mov.u32 %r1878, 0; // end inline asm // begin inline asm mov.u32 %r1879, 0; // end inline asm // begin inline asm mov.u32 %r1880, 0; // end inline asm // begin inline asm mov.u32 %r1881, 0; // end inline asm // begin inline asm mov.u32 %r1882, 0; // end inline asm // begin inline asm mov.u32 %r1883, 0; // end inline asm // begin inline asm mov.u32 %r1884, 0; // end inline asm // begin inline asm mov.u32 %r1885, 0; // end inline asm // begin inline asm mov.u32 %r1886, 0; // end inline asm // begin inline asm mov.u32 %r1887, 0; // end inline asm // begin inline asm mov.u32 %r1888, 0; // end inline asm // begin inline asm mov.u32 %r1889, 0; // end inline asm // begin inline asm mov.u32 %r1890, 0; // end inline asm // begin inline asm mov.u32 %r1891, 0; // end inline asm // begin inline asm mov.u32 %r1892, 0; // end inline asm // begin inline asm mov.u32 %r1893, 0; // end inline asm // begin inline asm mov.u32 %r1894, 0; // end inline asm // begin inline asm mov.u32 %r1895, 0; // end inline asm // begin inline asm mov.u32 %r1896, 0; // end inline asm // begin inline asm mov.u32 %r1897, 0; // end inline asm // begin inline asm mov.u32 %r1898, 0; // end inline asm // begin inline asm mov.u32 %r1899, 0; // end inline asm // begin inline asm mov.u32 %r1900, 0; // end inline asm // begin inline asm mov.u32 %r1901, 
0; // end inline asm // begin inline asm mov.u32 %r1902, 0; // end inline asm // begin inline asm mov.u32 %r1903, 0; // end inline asm // begin inline asm mov.u32 %r1904, 0; // end inline asm // begin inline asm mov.u32 %r1905, 0; // end inline asm // begin inline asm mov.u32 %r1906, 0; // end inline asm // begin inline asm mov.u32 %r1907, 0; // end inline asm // begin inline asm mov.u32 %r1908, 0; // end inline asm // begin inline asm mov.u32 %r1909, 0; // end inline asm // begin inline asm mov.u32 %r1910, 0; // end inline asm // begin inline asm mov.u32 %r1911, 0; // end inline asm // begin inline asm mov.u32 %r1912, 0; // end inline asm // begin inline asm mov.u32 %r1913, 0; // end inline asm // begin inline asm mov.u32 %r1914, 0; // end inline asm // begin inline asm mov.u32 %r1915, 0; // end inline asm // begin inline asm mov.u32 %r1916, 0; // end inline asm // begin inline asm mov.u32 %r1917, 0; // end inline asm // begin inline asm mov.u32 %r1918, 0; // end inline asm // begin inline asm mov.u32 %r1919, 0; // end inline asm // begin inline asm mov.u32 %r1920, 0; // end inline asm // begin inline asm mov.u32 %r1921, 0; // end inline asm // begin inline asm mov.u32 %r1922, 0; // end inline asm // begin inline asm mov.u32 %r1923, 0; // end inline asm // begin inline asm mov.u32 %r1924, 0; // end inline asm // begin inline asm mov.u32 %r1925, 0; // end inline asm // begin inline asm mov.u32 %r1926, 0; // end inline asm // begin inline asm mov.u32 %r1927, 0; // end inline asm // begin inline asm mov.u32 %r1928, 0; // end inline asm // begin inline asm mov.u32 %r1929, 0; // end inline asm // begin inline asm mov.u32 %r1930, 0; // end inline asm // begin inline asm mov.u32 %r1931, 0; // end inline asm // begin inline asm mov.u32 %r1932, 0; // end inline asm // begin inline asm mov.u32 %r1933, 0; // end inline asm // begin inline asm mov.u32 %r1934, 0; // end inline asm // begin inline asm mov.u32 %r1935, 0; // end inline asm // begin inline asm mov.u32 %r1936, 0; // end inline asm // begin inline asm mov.u32 %r1937, 0; // end inline asm // begin inline asm mov.u32 %r1938, 0; // end inline asm // begin inline asm mov.u32 %r1939, 0; // end inline asm // begin inline asm mov.u32 %r1940, 0; // end inline asm // begin inline asm mov.u32 %r1941, 0; // end inline asm // begin inline asm mov.u32 %r1942, 0; // end inline asm // begin inline asm mov.u32 %r1943, 0; // end inline asm // begin inline asm mov.u32 %r1944, 0; // end inline asm // begin inline asm mov.u32 %r1945, 0; // end inline asm // begin inline asm mov.u32 %r1946, 0; // end inline asm // begin inline asm mov.u32 %r1947, 0; // end inline asm // begin inline asm mov.u32 %r1948, 0; // end inline asm // begin inline asm mov.u32 %r1949, 0; // end inline asm // begin inline asm mov.u32 %r1950, 0; // end inline asm // begin inline asm mov.u32 %r1951, 0; // end inline asm // begin inline asm mov.u32 %r1952, 0; // end inline asm // begin inline asm mov.u32 %r1953, 0; // end inline asm // begin inline asm mov.u32 %r1954, 0; // end inline asm // begin inline asm mov.u32 %r1955, 0; // end inline asm // begin inline asm mov.u32 %r1956, 0; // end inline asm // begin inline asm mov.u32 %r1957, 0; // end inline asm // begin inline asm mov.u32 %r1958, 0; // end inline asm // begin inline asm mov.u32 %r1959, 0; // end inline asm // begin inline asm mov.u32 %r1960, 0; // end inline asm // begin inline asm mov.u32 %r1961, 0; // end inline asm mov.b32 %f514, %r1834; mov.b32 %f515, %r1835; mov.b32 %f516, %r1836; mov.b32 %f517, %r1837; // begin inline asm 
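//
// ---- P*V accumulation setup (annotation) ----
// The probabilities are packed back into fp16 pairs with cvt.rn.f16x2.f32
// ({%r1830..%r1833}), and the 128 zeroed registers %r1834..%r1961 provide
// fresh fp32 accumulators. The mma.sync chain below multiplies the 16x16
// probability fragment against all sixteen 16-column slices of the V tile,
// producing this iteration's P*V partial, which is folded into the running
// output registers afterwards.
//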
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1830, %r1831, %r1832, %r1833}, {%r1750, %r1751}, {%f514, %f515, %f516, %f517}; // end inline asm mov.b32 %f522, %r1838; mov.b32 %f523, %r1839; mov.b32 %f524, %r1840; mov.b32 %f525, %r1841; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1830, %r1831, %r1832, %r1833}, {%r1752, %r1753}, {%f522, %f523, %f524, %f525}; // end inline asm mov.b32 %f530, %r1842; mov.b32 %f531, %r1843; mov.b32 %f532, %r1844; mov.b32 %f533, %r1845; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f530, %f531, %f532, %f533}, {%r1830, %r1831, %r1832, %r1833}, {%r1755, %r1756}, {%f530, %f531, %f532, %f533}; // end inline asm mov.b32 %f538, %r1846; mov.b32 %f539, %r1847; mov.b32 %f540, %r1848; mov.b32 %f541, %r1849; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f538, %f539, %f540, %f541}, {%r1830, %r1831, %r1832, %r1833}, {%r1757, %r1758}, {%f538, %f539, %f540, %f541}; // end inline asm mov.b32 %f546, %r1850; mov.b32 %f547, %r1851; mov.b32 %f548, %r1852; mov.b32 %f549, %r1853; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f546, %f547, %f548, %f549}, {%r1830, %r1831, %r1832, %r1833}, {%r1760, %r1761}, {%f546, %f547, %f548, %f549}; // end inline asm mov.b32 %f554, %r1854; mov.b32 %f555, %r1855; mov.b32 %f556, %r1856; mov.b32 %f557, %r1857; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f554, %f555, %f556, %f557}, {%r1830, %r1831, %r1832, %r1833}, {%r1762, %r1763}, {%f554, %f555, %f556, %f557}; // end inline asm mov.b32 %f562, %r1858; mov.b32 %f563, %r1859; mov.b32 %f564, %r1860; mov.b32 %f565, %r1861; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f562, %f563, %f564, %f565}, {%r1830, %r1831, %r1832, %r1833}, {%r1765, %r1766}, {%f562, %f563, %f564, %f565}; // end inline asm mov.b32 %f570, %r1862; mov.b32 %f571, %r1863; mov.b32 %f572, %r1864; mov.b32 %f573, %r1865; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r1830, %r1831, %r1832, %r1833}, {%r1767, %r1768}, {%f570, %f571, %f572, %f573}; // end inline asm mov.b32 %f578, %r1866; mov.b32 %f579, %r1867; mov.b32 %f580, %r1868; mov.b32 %f581, %r1869; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r1830, %r1831, %r1832, %r1833}, {%r1770, %r1771}, {%f578, %f579, %f580, %f581}; // end inline asm mov.b32 %f586, %r1870; mov.b32 %f587, %r1871; mov.b32 %f588, %r1872; mov.b32 %f589, %r1873; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r1830, %r1831, %r1832, %r1833}, {%r1772, %r1773}, {%f586, %f587, %f588, %f589}; // end inline asm mov.b32 %f594, %r1874; mov.b32 %f595, %r1875; mov.b32 %f596, %r1876; mov.b32 %f597, %r1877; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r1830, %r1831, %r1832, %r1833}, {%r1775, %r1776}, {%f594, %f595, %f596, %f597}; // end inline asm mov.b32 %f602, %r1878; mov.b32 %f603, %r1879; mov.b32 %f604, %r1880; mov.b32 %f605, %r1881; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f602, %f603, %f604, %f605}, {%r1830, %r1831, %r1832, %r1833}, {%r1777, %r1778}, {%f602, %f603, %f604, %f605}; // end inline asm mov.b32 %f610, %r1882; mov.b32 %f611, %r1883; mov.b32 %f612, %r1884; mov.b32 %f613, %r1885; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 
{%f610, %f611, %f612, %f613}, {%r1830, %r1831, %r1832, %r1833}, {%r1780, %r1781}, {%f610, %f611, %f612, %f613}; // end inline asm mov.b32 %f618, %r1886; mov.b32 %f619, %r1887; mov.b32 %f620, %r1888; mov.b32 %f621, %r1889; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f618, %f619, %f620, %f621}, {%r1830, %r1831, %r1832, %r1833}, {%r1782, %r1783}, {%f618, %f619, %f620, %f621}; // end inline asm mov.b32 %f626, %r1890; mov.b32 %f627, %r1891; mov.b32 %f628, %r1892; mov.b32 %f629, %r1893; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f626, %f627, %f628, %f629}, {%r1830, %r1831, %r1832, %r1833}, {%r1785, %r1786}, {%f626, %f627, %f628, %f629}; // end inline asm mov.b32 %f634, %r1894; mov.b32 %f635, %r1895; mov.b32 %f636, %r1896; mov.b32 %f637, %r1897; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f634, %f635, %f636, %f637}, {%r1830, %r1831, %r1832, %r1833}, {%r1787, %r1788}, {%f634, %f635, %f636, %f637}; // end inline asm mov.b32 %f642, %r1898; mov.b32 %f643, %r1899; mov.b32 %f644, %r1900; mov.b32 %f645, %r1901; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f642, %f643, %f644, %f645}, {%r1830, %r1831, %r1832, %r1833}, {%r1790, %r1791}, {%f642, %f643, %f644, %f645}; // end inline asm mov.b32 %f650, %r1902; mov.b32 %f651, %r1903; mov.b32 %f652, %r1904; mov.b32 %f653, %r1905; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f650, %f651, %f652, %f653}, {%r1830, %r1831, %r1832, %r1833}, {%r1792, %r1793}, {%f650, %f651, %f652, %f653}; // end inline asm mov.b32 %f658, %r1906; mov.b32 %f659, %r1907; mov.b32 %f660, %r1908; mov.b32 %f661, %r1909; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f658, %f659, %f660, %f661}, {%r1830, %r1831, %r1832, %r1833}, {%r1795, %r1796}, {%f658, %f659, %f660, %f661}; // end inline asm mov.b32 %f666, %r1910; mov.b32 %f667, %r1911; mov.b32 %f668, %r1912; mov.b32 %f669, %r1913; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f666, %f667, %f668, %f669}, {%r1830, %r1831, %r1832, %r1833}, {%r1797, %r1798}, {%f666, %f667, %f668, %f669}; // end inline asm mov.b32 %f674, %r1914; mov.b32 %f675, %r1915; mov.b32 %f676, %r1916; mov.b32 %f677, %r1917; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f674, %f675, %f676, %f677}, {%r1830, %r1831, %r1832, %r1833}, {%r1800, %r1801}, {%f674, %f675, %f676, %f677}; // end inline asm mov.b32 %f682, %r1918; mov.b32 %f683, %r1919; mov.b32 %f684, %r1920; mov.b32 %f685, %r1921; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f682, %f683, %f684, %f685}, {%r1830, %r1831, %r1832, %r1833}, {%r1802, %r1803}, {%f682, %f683, %f684, %f685}; // end inline asm mov.b32 %f690, %r1922; mov.b32 %f691, %r1923; mov.b32 %f692, %r1924; mov.b32 %f693, %r1925; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f690, %f691, %f692, %f693}, {%r1830, %r1831, %r1832, %r1833}, {%r1805, %r1806}, {%f690, %f691, %f692, %f693}; // end inline asm mov.b32 %f698, %r1926; mov.b32 %f699, %r1927; mov.b32 %f700, %r1928; mov.b32 %f701, %r1929; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f698, %f699, %f700, %f701}, {%r1830, %r1831, %r1832, %r1833}, {%r1807, %r1808}, {%f698, %f699, %f700, %f701}; // end inline asm mov.b32 %f706, %r1930; mov.b32 %f707, %r1931; mov.b32 %f708, %r1932; mov.b32 %f709, %r1933; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f706, %f707, %f708, %f709}, {%r1830, %r1831, 
%r1832, %r1833}, {%r1810, %r1811}, {%f706, %f707, %f708, %f709}; // end inline asm mov.b32 %f714, %r1934; mov.b32 %f715, %r1935; mov.b32 %f716, %r1936; mov.b32 %f717, %r1937; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f714, %f715, %f716, %f717}, {%r1830, %r1831, %r1832, %r1833}, {%r1812, %r1813}, {%f714, %f715, %f716, %f717}; // end inline asm mov.b32 %f722, %r1938; mov.b32 %f723, %r1939; mov.b32 %f724, %r1940; mov.b32 %f725, %r1941; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f722, %f723, %f724, %f725}, {%r1830, %r1831, %r1832, %r1833}, {%r1815, %r1816}, {%f722, %f723, %f724, %f725}; // end inline asm mov.b32 %f730, %r1942; mov.b32 %f731, %r1943; mov.b32 %f732, %r1944; mov.b32 %f733, %r1945; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f730, %f731, %f732, %f733}, {%r1830, %r1831, %r1832, %r1833}, {%r1817, %r1818}, {%f730, %f731, %f732, %f733}; // end inline asm mov.b32 %f738, %r1946; mov.b32 %f739, %r1947; mov.b32 %f740, %r1948; mov.b32 %f741, %r1949; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f738, %f739, %f740, %f741}, {%r1830, %r1831, %r1832, %r1833}, {%r1820, %r1821}, {%f738, %f739, %f740, %f741}; // end inline asm mov.b32 %f746, %r1950; mov.b32 %f747, %r1951; mov.b32 %f748, %r1952; mov.b32 %f749, %r1953; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f746, %f747, %f748, %f749}, {%r1830, %r1831, %r1832, %r1833}, {%r1822, %r1823}, {%f746, %f747, %f748, %f749}; // end inline asm mov.b32 %f754, %r1954; mov.b32 %f755, %r1955; mov.b32 %f756, %r1956; mov.b32 %f757, %r1957; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f754, %f755, %f756, %f757}, {%r1830, %r1831, %r1832, %r1833}, {%r1825, %r1826}, {%f754, %f755, %f756, %f757}; // end inline asm mov.b32 %f762, %r1958; mov.b32 %f763, %r1959; mov.b32 %f764, %r1960; mov.b32 %f765, %r1961; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f762, %f763, %f764, %f765}, {%r1830, %r1831, %r1832, %r1833}, {%r1827, %r1828}, {%f762, %f763, %f764, %f765}; // end inline asm setp.equ.ftz.f32 %p92, %f1285, 0f00000000; mov.f32 %f1297, 0f3F800000; mov.f32 %f1296, %f1297; @%p92 bra $L__BB0_62; rcp.approx.ftz.f32 %f1296, %f1285; $L__BB0_62: setp.equ.ftz.f32 %p93, %f1284, 0f00000000; @%p93 bra $L__BB0_64; rcp.approx.ftz.f32 %f1297, %f1284; $L__BB0_64: add.s32 %r2730, %r2730, 16; setp.lt.s32 %p128, %r2730, %r233; mov.b32 %f772, %r2921; fma.rn.ftz.f32 %f773, %f42, %f772, %f514; mul.ftz.f32 %f774, %f1296, %f773; mov.b32 %r2921, %f774; mov.b32 %f775, %r2920; fma.rn.ftz.f32 %f776, %f42, %f775, %f515; mul.ftz.f32 %f777, %f1296, %f776; mov.b32 %r2920, %f777; mov.b32 %f778, %r2919; fma.rn.ftz.f32 %f779, %f44, %f778, %f516; mul.ftz.f32 %f780, %f1297, %f779; mov.b32 %r2919, %f780; mov.b32 %f781, %r2918; fma.rn.ftz.f32 %f782, %f44, %f781, %f517; mul.ftz.f32 %f783, %f1297, %f782; mov.b32 %r2918, %f783; mov.b32 %f784, %r2917; fma.rn.ftz.f32 %f785, %f42, %f784, %f522; mul.ftz.f32 %f786, %f1296, %f785; mov.b32 %r2917, %f786; mov.b32 %f787, %r2916; fma.rn.ftz.f32 %f788, %f42, %f787, %f523; mul.ftz.f32 %f789, %f1296, %f788; mov.b32 %r2916, %f789; mov.b32 %f790, %r2915; fma.rn.ftz.f32 %f791, %f44, %f790, %f524; mul.ftz.f32 %f792, %f1297, %f791; mov.b32 %r2915, %f792; mov.b32 %f793, %r2914; fma.rn.ftz.f32 %f794, %f44, %f793, %f525; mul.ftz.f32 %f795, %f1297, %f794; mov.b32 %r2914, %f795; mov.b32 %f796, %r2913; fma.rn.ftz.f32 %f797, %f42, %f796, %f530; mul.ftz.f32 %f798, %f1296, %f797; mov.b32 
%r2913, %f798; mov.b32 %f799, %r2912; fma.rn.ftz.f32 %f800, %f42, %f799, %f531; mul.ftz.f32 %f801, %f1296, %f800; mov.b32 %r2912, %f801; mov.b32 %f802, %r2911; fma.rn.ftz.f32 %f803, %f44, %f802, %f532; mul.ftz.f32 %f804, %f1297, %f803; mov.b32 %r2911, %f804; mov.b32 %f805, %r2910; fma.rn.ftz.f32 %f806, %f44, %f805, %f533; mul.ftz.f32 %f807, %f1297, %f806; mov.b32 %r2910, %f807; mov.b32 %f808, %r2909; fma.rn.ftz.f32 %f809, %f42, %f808, %f538; mul.ftz.f32 %f810, %f1296, %f809; mov.b32 %r2909, %f810; mov.b32 %f811, %r2908; fma.rn.ftz.f32 %f812, %f42, %f811, %f539; mul.ftz.f32 %f813, %f1296, %f812; mov.b32 %r2908, %f813; mov.b32 %f814, %r2907; fma.rn.ftz.f32 %f815, %f44, %f814, %f540; mul.ftz.f32 %f816, %f1297, %f815; mov.b32 %r2907, %f816; mov.b32 %f817, %r2906; fma.rn.ftz.f32 %f818, %f44, %f817, %f541; mul.ftz.f32 %f819, %f1297, %f818; mov.b32 %r2906, %f819; mov.b32 %f820, %r2905; fma.rn.ftz.f32 %f821, %f42, %f820, %f546; mul.ftz.f32 %f822, %f1296, %f821; mov.b32 %r2905, %f822; mov.b32 %f823, %r2904; fma.rn.ftz.f32 %f824, %f42, %f823, %f547; mul.ftz.f32 %f825, %f1296, %f824; mov.b32 %r2904, %f825; mov.b32 %f826, %r2903; fma.rn.ftz.f32 %f827, %f44, %f826, %f548; mul.ftz.f32 %f828, %f1297, %f827; mov.b32 %r2903, %f828; mov.b32 %f829, %r2902; fma.rn.ftz.f32 %f830, %f44, %f829, %f549; mul.ftz.f32 %f831, %f1297, %f830; mov.b32 %r2902, %f831; mov.b32 %f832, %r2901; fma.rn.ftz.f32 %f833, %f42, %f832, %f554; mul.ftz.f32 %f834, %f1296, %f833; mov.b32 %r2901, %f834; mov.b32 %f835, %r2900; fma.rn.ftz.f32 %f836, %f42, %f835, %f555; mul.ftz.f32 %f837, %f1296, %f836; mov.b32 %r2900, %f837; mov.b32 %f838, %r2899; fma.rn.ftz.f32 %f839, %f44, %f838, %f556; mul.ftz.f32 %f840, %f1297, %f839; mov.b32 %r2899, %f840; mov.b32 %f841, %r2898; fma.rn.ftz.f32 %f842, %f44, %f841, %f557; mul.ftz.f32 %f843, %f1297, %f842; mov.b32 %r2898, %f843; mov.b32 %f844, %r2897; fma.rn.ftz.f32 %f845, %f42, %f844, %f562; mul.ftz.f32 %f846, %f1296, %f845; mov.b32 %r2897, %f846; mov.b32 %f847, %r2896; fma.rn.ftz.f32 %f848, %f42, %f847, %f563; mul.ftz.f32 %f849, %f1296, %f848; mov.b32 %r2896, %f849; mov.b32 %f850, %r2895; fma.rn.ftz.f32 %f851, %f44, %f850, %f564; mul.ftz.f32 %f852, %f1297, %f851; mov.b32 %r2895, %f852; mov.b32 %f853, %r2894; fma.rn.ftz.f32 %f854, %f44, %f853, %f565; mul.ftz.f32 %f855, %f1297, %f854; mov.b32 %r2894, %f855; mov.b32 %f856, %r2893; fma.rn.ftz.f32 %f857, %f42, %f856, %f570; mul.ftz.f32 %f858, %f1296, %f857; mov.b32 %r2893, %f858; mov.b32 %f859, %r2892; fma.rn.ftz.f32 %f860, %f42, %f859, %f571; mul.ftz.f32 %f861, %f1296, %f860; mov.b32 %r2892, %f861; mov.b32 %f862, %r2891; fma.rn.ftz.f32 %f863, %f44, %f862, %f572; mul.ftz.f32 %f864, %f1297, %f863; mov.b32 %r2891, %f864; mov.b32 %f865, %r2890; fma.rn.ftz.f32 %f866, %f44, %f865, %f573; mul.ftz.f32 %f867, %f1297, %f866; mov.b32 %r2890, %f867; mov.b32 %f868, %r2889; fma.rn.ftz.f32 %f869, %f42, %f868, %f578; mul.ftz.f32 %f870, %f1296, %f869; mov.b32 %r2889, %f870; mov.b32 %f871, %r2888; fma.rn.ftz.f32 %f872, %f42, %f871, %f579; mul.ftz.f32 %f873, %f1296, %f872; mov.b32 %r2888, %f873; mov.b32 %f874, %r2887; fma.rn.ftz.f32 %f875, %f44, %f874, %f580; mul.ftz.f32 %f876, %f1297, %f875; mov.b32 %r2887, %f876; mov.b32 %f877, %r2886; fma.rn.ftz.f32 %f878, %f44, %f877, %f581; mul.ftz.f32 %f879, %f1297, %f878; mov.b32 %r2886, %f879; mov.b32 %f880, %r2885; fma.rn.ftz.f32 %f881, %f42, %f880, %f586; mul.ftz.f32 %f882, %f1296, %f881; mov.b32 %r2885, %f882; mov.b32 %f883, %r2884; fma.rn.ftz.f32 %f884, %f42, %f883, %f587; mul.ftz.f32 %f885, %f1296, %f884; mov.b32 %r2884, %f885; 
mov.b32 %f886, %r2883; fma.rn.ftz.f32 %f887, %f44, %f886, %f588; mul.ftz.f32 %f888, %f1297, %f887; mov.b32 %r2883, %f888; mov.b32 %f889, %r2882; fma.rn.ftz.f32 %f890, %f44, %f889, %f589; mul.ftz.f32 %f891, %f1297, %f890; mov.b32 %r2882, %f891; mov.b32 %f892, %r2881; fma.rn.ftz.f32 %f893, %f42, %f892, %f594; mul.ftz.f32 %f894, %f1296, %f893; mov.b32 %r2881, %f894; mov.b32 %f895, %r2880; fma.rn.ftz.f32 %f896, %f42, %f895, %f595; mul.ftz.f32 %f897, %f1296, %f896; mov.b32 %r2880, %f897; mov.b32 %f898, %r2879; fma.rn.ftz.f32 %f899, %f44, %f898, %f596; mul.ftz.f32 %f900, %f1297, %f899; mov.b32 %r2879, %f900; mov.b32 %f901, %r2878; fma.rn.ftz.f32 %f902, %f44, %f901, %f597; mul.ftz.f32 %f903, %f1297, %f902; mov.b32 %r2878, %f903; mov.b32 %f904, %r2877; fma.rn.ftz.f32 %f905, %f42, %f904, %f602; mul.ftz.f32 %f906, %f1296, %f905; mov.b32 %r2877, %f906; mov.b32 %f907, %r2876; fma.rn.ftz.f32 %f908, %f42, %f907, %f603; mul.ftz.f32 %f909, %f1296, %f908; mov.b32 %r2876, %f909; mov.b32 %f910, %r2875; fma.rn.ftz.f32 %f911, %f44, %f910, %f604; mul.ftz.f32 %f912, %f1297, %f911; mov.b32 %r2875, %f912; mov.b32 %f913, %r2874; fma.rn.ftz.f32 %f914, %f44, %f913, %f605; mul.ftz.f32 %f915, %f1297, %f914; mov.b32 %r2874, %f915; mov.b32 %f916, %r2873; fma.rn.ftz.f32 %f917, %f42, %f916, %f610; mul.ftz.f32 %f918, %f1296, %f917; mov.b32 %r2873, %f918; mov.b32 %f919, %r2872; fma.rn.ftz.f32 %f920, %f42, %f919, %f611; mul.ftz.f32 %f921, %f1296, %f920; mov.b32 %r2872, %f921; mov.b32 %f922, %r2871; fma.rn.ftz.f32 %f923, %f44, %f922, %f612; mul.ftz.f32 %f924, %f1297, %f923; mov.b32 %r2871, %f924; mov.b32 %f925, %r2870; fma.rn.ftz.f32 %f926, %f44, %f925, %f613; mul.ftz.f32 %f927, %f1297, %f926; mov.b32 %r2870, %f927; mov.b32 %f928, %r2869; fma.rn.ftz.f32 %f929, %f42, %f928, %f618; mul.ftz.f32 %f930, %f1296, %f929; mov.b32 %r2869, %f930; mov.b32 %f931, %r2868; fma.rn.ftz.f32 %f932, %f42, %f931, %f619; mul.ftz.f32 %f933, %f1296, %f932; mov.b32 %r2868, %f933; mov.b32 %f934, %r2867; fma.rn.ftz.f32 %f935, %f44, %f934, %f620; mul.ftz.f32 %f936, %f1297, %f935; mov.b32 %r2867, %f936; mov.b32 %f937, %r2866; fma.rn.ftz.f32 %f938, %f44, %f937, %f621; mul.ftz.f32 %f939, %f1297, %f938; mov.b32 %r2866, %f939; mov.b32 %f940, %r2865; fma.rn.ftz.f32 %f941, %f42, %f940, %f626; mul.ftz.f32 %f942, %f1296, %f941; mov.b32 %r2865, %f942; mov.b32 %f943, %r2864; fma.rn.ftz.f32 %f944, %f42, %f943, %f627; mul.ftz.f32 %f945, %f1296, %f944; mov.b32 %r2864, %f945; mov.b32 %f946, %r2863; fma.rn.ftz.f32 %f947, %f44, %f946, %f628; mul.ftz.f32 %f948, %f1297, %f947; mov.b32 %r2863, %f948; mov.b32 %f949, %r2862; fma.rn.ftz.f32 %f950, %f44, %f949, %f629; mul.ftz.f32 %f951, %f1297, %f950; mov.b32 %r2862, %f951; mov.b32 %f952, %r2861; fma.rn.ftz.f32 %f953, %f42, %f952, %f634; mul.ftz.f32 %f954, %f1296, %f953; mov.b32 %r2861, %f954; mov.b32 %f955, %r2860; fma.rn.ftz.f32 %f956, %f42, %f955, %f635; mul.ftz.f32 %f957, %f1296, %f956; mov.b32 %r2860, %f957; mov.b32 %f958, %r2859; fma.rn.ftz.f32 %f959, %f44, %f958, %f636; mul.ftz.f32 %f960, %f1297, %f959; mov.b32 %r2859, %f960; mov.b32 %f961, %r2858; fma.rn.ftz.f32 %f962, %f44, %f961, %f637; mul.ftz.f32 %f963, %f1297, %f962; mov.b32 %r2858, %f963; mov.b32 %f964, %r2857; fma.rn.ftz.f32 %f965, %f42, %f964, %f642; mul.ftz.f32 %f966, %f1296, %f965; mov.b32 %r2857, %f966; mov.b32 %f967, %r2856; fma.rn.ftz.f32 %f968, %f42, %f967, %f643; mul.ftz.f32 %f969, %f1296, %f968; mov.b32 %r2856, %f969; mov.b32 %f970, %r2855; fma.rn.ftz.f32 %f971, %f44, %f970, %f644; mul.ftz.f32 %f972, %f1297, %f971; mov.b32 %r2855, %f972; mov.b32 %f973, 
%r2854; fma.rn.ftz.f32 %f974, %f44, %f973, %f645; mul.ftz.f32 %f975, %f1297, %f974; mov.b32 %r2854, %f975; mov.b32 %f976, %r2853; fma.rn.ftz.f32 %f977, %f42, %f976, %f650; mul.ftz.f32 %f978, %f1296, %f977; mov.b32 %r2853, %f978; mov.b32 %f979, %r2852; fma.rn.ftz.f32 %f980, %f42, %f979, %f651; mul.ftz.f32 %f981, %f1296, %f980; mov.b32 %r2852, %f981; mov.b32 %f982, %r2851; fma.rn.ftz.f32 %f983, %f44, %f982, %f652; mul.ftz.f32 %f984, %f1297, %f983; mov.b32 %r2851, %f984; mov.b32 %f985, %r2850; fma.rn.ftz.f32 %f986, %f44, %f985, %f653; mul.ftz.f32 %f987, %f1297, %f986; mov.b32 %r2850, %f987; mov.b32 %f988, %r2849; fma.rn.ftz.f32 %f989, %f42, %f988, %f658; mul.ftz.f32 %f990, %f1296, %f989; mov.b32 %r2849, %f990; mov.b32 %f991, %r2848; fma.rn.ftz.f32 %f992, %f42, %f991, %f659; mul.ftz.f32 %f993, %f1296, %f992; mov.b32 %r2848, %f993; mov.b32 %f994, %r2847; fma.rn.ftz.f32 %f995, %f44, %f994, %f660; mul.ftz.f32 %f996, %f1297, %f995; mov.b32 %r2847, %f996; mov.b32 %f997, %r2846; fma.rn.ftz.f32 %f998, %f44, %f997, %f661; mul.ftz.f32 %f999, %f1297, %f998; mov.b32 %r2846, %f999; mov.b32 %f1000, %r2845; fma.rn.ftz.f32 %f1001, %f42, %f1000, %f666; mul.ftz.f32 %f1002, %f1296, %f1001; mov.b32 %r2845, %f1002; mov.b32 %f1003, %r2844; fma.rn.ftz.f32 %f1004, %f42, %f1003, %f667; mul.ftz.f32 %f1005, %f1296, %f1004; mov.b32 %r2844, %f1005; mov.b32 %f1006, %r2843; fma.rn.ftz.f32 %f1007, %f44, %f1006, %f668; mul.ftz.f32 %f1008, %f1297, %f1007; mov.b32 %r2843, %f1008; mov.b32 %f1009, %r2842; fma.rn.ftz.f32 %f1010, %f44, %f1009, %f669; mul.ftz.f32 %f1011, %f1297, %f1010; mov.b32 %r2842, %f1011; mov.b32 %f1012, %r2841; fma.rn.ftz.f32 %f1013, %f42, %f1012, %f674; mul.ftz.f32 %f1014, %f1296, %f1013; mov.b32 %r2841, %f1014; mov.b32 %f1015, %r2840; fma.rn.ftz.f32 %f1016, %f42, %f1015, %f675; mul.ftz.f32 %f1017, %f1296, %f1016; mov.b32 %r2840, %f1017; mov.b32 %f1018, %r2839; fma.rn.ftz.f32 %f1019, %f44, %f1018, %f676; mul.ftz.f32 %f1020, %f1297, %f1019; mov.b32 %r2839, %f1020; mov.b32 %f1021, %r2838; fma.rn.ftz.f32 %f1022, %f44, %f1021, %f677; mul.ftz.f32 %f1023, %f1297, %f1022; mov.b32 %r2838, %f1023; mov.b32 %f1024, %r2837; fma.rn.ftz.f32 %f1025, %f42, %f1024, %f682; mul.ftz.f32 %f1026, %f1296, %f1025; mov.b32 %r2837, %f1026; mov.b32 %f1027, %r2836; fma.rn.ftz.f32 %f1028, %f42, %f1027, %f683; mul.ftz.f32 %f1029, %f1296, %f1028; mov.b32 %r2836, %f1029; mov.b32 %f1030, %r2835; fma.rn.ftz.f32 %f1031, %f44, %f1030, %f684; mul.ftz.f32 %f1032, %f1297, %f1031; mov.b32 %r2835, %f1032; mov.b32 %f1033, %r2834; fma.rn.ftz.f32 %f1034, %f44, %f1033, %f685; mul.ftz.f32 %f1035, %f1297, %f1034; mov.b32 %r2834, %f1035; mov.b32 %f1036, %r2833; fma.rn.ftz.f32 %f1037, %f42, %f1036, %f690; mul.ftz.f32 %f1038, %f1296, %f1037; mov.b32 %r2833, %f1038; mov.b32 %f1039, %r2832; fma.rn.ftz.f32 %f1040, %f42, %f1039, %f691; mul.ftz.f32 %f1041, %f1296, %f1040; mov.b32 %r2832, %f1041; mov.b32 %f1042, %r2831; fma.rn.ftz.f32 %f1043, %f44, %f1042, %f692; mul.ftz.f32 %f1044, %f1297, %f1043; mov.b32 %r2831, %f1044; mov.b32 %f1045, %r2830; fma.rn.ftz.f32 %f1046, %f44, %f1045, %f693; mul.ftz.f32 %f1047, %f1297, %f1046; mov.b32 %r2830, %f1047; mov.b32 %f1048, %r2829; fma.rn.ftz.f32 %f1049, %f42, %f1048, %f698; mul.ftz.f32 %f1050, %f1296, %f1049; mov.b32 %r2829, %f1050; mov.b32 %f1051, %r2828; fma.rn.ftz.f32 %f1052, %f42, %f1051, %f699; mul.ftz.f32 %f1053, %f1296, %f1052; mov.b32 %r2828, %f1053; mov.b32 %f1054, %r2827; fma.rn.ftz.f32 %f1055, %f44, %f1054, %f700; mul.ftz.f32 %f1056, %f1297, %f1055; mov.b32 %r2827, %f1056; mov.b32 %f1057, %r2826; fma.rn.ftz.f32 
%f1058, %f44, %f1057, %f701; mul.ftz.f32 %f1059, %f1297, %f1058; mov.b32 %r2826, %f1059; mov.b32 %f1060, %r2825; fma.rn.ftz.f32 %f1061, %f42, %f1060, %f706; mul.ftz.f32 %f1062, %f1296, %f1061; mov.b32 %r2825, %f1062; mov.b32 %f1063, %r2824; fma.rn.ftz.f32 %f1064, %f42, %f1063, %f707; mul.ftz.f32 %f1065, %f1296, %f1064; mov.b32 %r2824, %f1065; mov.b32 %f1066, %r2823; fma.rn.ftz.f32 %f1067, %f44, %f1066, %f708; mul.ftz.f32 %f1068, %f1297, %f1067; mov.b32 %r2823, %f1068; mov.b32 %f1069, %r2822; fma.rn.ftz.f32 %f1070, %f44, %f1069, %f709; mul.ftz.f32 %f1071, %f1297, %f1070; mov.b32 %r2822, %f1071; mov.b32 %f1072, %r2821; fma.rn.ftz.f32 %f1073, %f42, %f1072, %f714; mul.ftz.f32 %f1074, %f1296, %f1073; mov.b32 %r2821, %f1074; mov.b32 %f1075, %r2820; fma.rn.ftz.f32 %f1076, %f42, %f1075, %f715; mul.ftz.f32 %f1077, %f1296, %f1076; mov.b32 %r2820, %f1077; mov.b32 %f1078, %r2819; fma.rn.ftz.f32 %f1079, %f44, %f1078, %f716; mul.ftz.f32 %f1080, %f1297, %f1079; mov.b32 %r2819, %f1080; mov.b32 %f1081, %r2818; fma.rn.ftz.f32 %f1082, %f44, %f1081, %f717; mul.ftz.f32 %f1083, %f1297, %f1082; mov.b32 %r2818, %f1083; mov.b32 %f1084, %r2817; fma.rn.ftz.f32 %f1085, %f42, %f1084, %f722; mul.ftz.f32 %f1086, %f1296, %f1085; mov.b32 %r2817, %f1086; mov.b32 %f1087, %r2816; fma.rn.ftz.f32 %f1088, %f42, %f1087, %f723; mul.ftz.f32 %f1089, %f1296, %f1088; mov.b32 %r2816, %f1089; mov.b32 %f1090, %r2815; fma.rn.ftz.f32 %f1091, %f44, %f1090, %f724; mul.ftz.f32 %f1092, %f1297, %f1091; mov.b32 %r2815, %f1092; mov.b32 %f1093, %r2814; fma.rn.ftz.f32 %f1094, %f44, %f1093, %f725; mul.ftz.f32 %f1095, %f1297, %f1094; mov.b32 %r2814, %f1095; mov.b32 %f1096, %r2813; fma.rn.ftz.f32 %f1097, %f42, %f1096, %f730; mul.ftz.f32 %f1098, %f1296, %f1097; mov.b32 %r2813, %f1098; mov.b32 %f1099, %r2812; fma.rn.ftz.f32 %f1100, %f42, %f1099, %f731; mul.ftz.f32 %f1101, %f1296, %f1100; mov.b32 %r2812, %f1101; mov.b32 %f1102, %r2811; fma.rn.ftz.f32 %f1103, %f44, %f1102, %f732; mul.ftz.f32 %f1104, %f1297, %f1103; mov.b32 %r2811, %f1104; mov.b32 %f1105, %r2810; fma.rn.ftz.f32 %f1106, %f44, %f1105, %f733; mul.ftz.f32 %f1107, %f1297, %f1106; mov.b32 %r2810, %f1107; mov.b32 %f1108, %r2809; fma.rn.ftz.f32 %f1109, %f42, %f1108, %f738; mul.ftz.f32 %f1110, %f1296, %f1109; mov.b32 %r2809, %f1110; mov.b32 %f1111, %r2808; fma.rn.ftz.f32 %f1112, %f42, %f1111, %f739; mul.ftz.f32 %f1113, %f1296, %f1112; mov.b32 %r2808, %f1113; mov.b32 %f1114, %r2807; fma.rn.ftz.f32 %f1115, %f44, %f1114, %f740; mul.ftz.f32 %f1116, %f1297, %f1115; mov.b32 %r2807, %f1116; mov.b32 %f1117, %r2806; fma.rn.ftz.f32 %f1118, %f44, %f1117, %f741; mul.ftz.f32 %f1119, %f1297, %f1118; mov.b32 %r2806, %f1119; mov.b32 %f1120, %r2805; fma.rn.ftz.f32 %f1121, %f42, %f1120, %f746; mul.ftz.f32 %f1122, %f1296, %f1121; mov.b32 %r2805, %f1122; mov.b32 %f1123, %r2804; fma.rn.ftz.f32 %f1124, %f42, %f1123, %f747; mul.ftz.f32 %f1125, %f1296, %f1124; mov.b32 %r2804, %f1125; mov.b32 %f1126, %r2803; fma.rn.ftz.f32 %f1127, %f44, %f1126, %f748; mul.ftz.f32 %f1128, %f1297, %f1127; mov.b32 %r2803, %f1128; mov.b32 %f1129, %r2802; fma.rn.ftz.f32 %f1130, %f44, %f1129, %f749; mul.ftz.f32 %f1131, %f1297, %f1130; mov.b32 %r2802, %f1131; mov.b32 %f1132, %r2801; fma.rn.ftz.f32 %f1133, %f42, %f1132, %f754; mul.ftz.f32 %f1134, %f1296, %f1133; mov.b32 %r2801, %f1134; mov.b32 %f1135, %r2800; fma.rn.ftz.f32 %f1136, %f42, %f1135, %f755; mul.ftz.f32 %f1137, %f1296, %f1136; mov.b32 %r2800, %f1137; mov.b32 %f1138, %r2799; fma.rn.ftz.f32 %f1139, %f44, %f1138, %f756; mul.ftz.f32 %f1140, %f1297, %f1139; mov.b32 %r2799, %f1140; mov.b32 
%f1141, %r2798; fma.rn.ftz.f32 %f1142, %f44, %f1141, %f757; mul.ftz.f32 %f1143, %f1297, %f1142; mov.b32 %r2798, %f1143; mov.b32 %f1144, %r2797; fma.rn.ftz.f32 %f1145, %f42, %f1144, %f762; mul.ftz.f32 %f1146, %f1296, %f1145; mov.b32 %r2797, %f1146; mov.b32 %f1147, %r2796; fma.rn.ftz.f32 %f1148, %f42, %f1147, %f763; mul.ftz.f32 %f1149, %f1296, %f1148; mov.b32 %r2796, %f1149; mov.b32 %f1150, %r2795; fma.rn.ftz.f32 %f1151, %f44, %f1150, %f764; mul.ftz.f32 %f1152, %f1297, %f1151; mov.b32 %r2795, %f1152; mov.b32 %f1153, %r2794; fma.rn.ftz.f32 %f1154, %f44, %f1153, %f765; mul.ftz.f32 %f1155, %f1297, %f1154; mov.b32 %r2794, %f1155; bar.sync 0; // begin inline asm st.shared.v4.b32 [%r1050], {%r2766, %r2767, %r2768, %r2769}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1055], {%r2770, %r2771, %r2772, %r2773}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1060], {%r2774, %r2775, %r2776, %r2777}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1065], {%r2778, %r2779, %r2780, %r2781}; // end inline asm add.s32 %r2731, %r2731, 16; mov.f32 %f1286, %f31; mov.f32 %f1287, %f30; mov.u32 %r2732, %r2793; mov.u32 %r2733, %r2792; @%p128 bra $L__BB0_19; $L__BB0_65: mov.b64 %rd212, fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0; mov.u64 %rd211, %rd212; ld.param.u32 %r2528, [%rd211+60]; mul.lo.s32 %r2527, %r1, %r879; mad.lo.s32 %r2526, %r2527, %r2, %r880; and.b32 %r2525, %r878, 96; shr.u32 %r2524, %r2525, 1; add.s32 %r2523, %r954, 32768; add.s32 %r2395, %r17, %r2523; bar.sync 0; mov.b32 %f1156, %r2920; mov.b32 %f1157, %r2921; // begin inline asm cvt.rn.f16x2.f32 %r2199, %f1156, %f1157; // end inline asm mov.b32 %f1158, %r2918; mov.b32 %f1159, %r2919; // begin inline asm cvt.rn.f16x2.f32 %r2200, %f1158, %f1159; // end inline asm shl.b32 %r2474, %r878, 2; and.b32 %r2475, %r2474, 124; add.s32 %r2476, %r2475, %r2523; and.b32 %r2479, %r878, 28; shr.u32 %r2480, %r2479, 2; or.b32 %r2481, %r2524, %r2480; shl.b32 %r2482, %r2481, 9; add.s32 %r2201, %r2476, %r2482; // begin inline asm st.shared.b32 [%r2201], %r2199; // end inline asm add.s32 %r2203, %r2201, 4096; // begin inline asm st.shared.b32 [%r2203], %r2200; // end inline asm xor.b32 %r2207, %r2201, 16; mov.b32 %f1160, %r2916; mov.b32 %f1161, %r2917; // begin inline asm cvt.rn.f16x2.f32 %r2205, %f1160, %f1161; // end inline asm mov.b32 %f1162, %r2914; mov.b32 %f1163, %r2915; // begin inline asm cvt.rn.f16x2.f32 %r2206, %f1162, %f1163; // end inline asm // begin inline asm st.shared.b32 [%r2207], %r2205; // end inline asm add.s32 %r2209, %r2207, 4096; // begin inline asm st.shared.b32 [%r2209], %r2206; // end inline asm xor.b32 %r2213, %r2201, 32; mov.b32 %f1164, %r2912; mov.b32 %f1165, %r2913; // begin inline asm cvt.rn.f16x2.f32 %r2211, %f1164, %f1165; // end inline asm mov.b32 %f1166, %r2910; mov.b32 %f1167, %r2911; // begin inline asm cvt.rn.f16x2.f32 %r2212, %f1166, %f1167; // end inline asm // begin inline asm st.shared.b32 [%r2213], %r2211; // end inline asm add.s32 %r2215, %r2213, 4096; // begin inline asm st.shared.b32 [%r2215], %r2212; // end inline asm xor.b32 %r2219, %r2201, 48; mov.b32 %f1168, %r2908; mov.b32 %f1169, %r2909; // begin inline asm cvt.rn.f16x2.f32 %r2217, %f1168, %f1169; // end inline asm mov.b32 %f1170, %r2906; mov.b32 %f1171, %r2907; // begin inline asm cvt.rn.f16x2.f32 %r2218, %f1170, %f1171; // end inline asm // begin inline asm st.shared.b32 [%r2219], %r2217; // end inline asm add.s32 %r2221, %r2219, 4096; // begin inline asm st.shared.b32 [%r2221], %r2218; // end inline asm xor.b32 
%r2225, %r2201, 64; mov.b32 %f1172, %r2904; mov.b32 %f1173, %r2905; // begin inline asm cvt.rn.f16x2.f32 %r2223, %f1172, %f1173; // end inline asm mov.b32 %f1174, %r2902; mov.b32 %f1175, %r2903; // begin inline asm cvt.rn.f16x2.f32 %r2224, %f1174, %f1175; // end inline asm // begin inline asm st.shared.b32 [%r2225], %r2223; // end inline asm add.s32 %r2227, %r2225, 4096; // begin inline asm st.shared.b32 [%r2227], %r2224; // end inline asm xor.b32 %r2231, %r2201, 80; mov.b32 %f1176, %r2900; mov.b32 %f1177, %r2901; // begin inline asm cvt.rn.f16x2.f32 %r2229, %f1176, %f1177; // end inline asm mov.b32 %f1178, %r2898; mov.b32 %f1179, %r2899; // begin inline asm cvt.rn.f16x2.f32 %r2230, %f1178, %f1179; // end inline asm // begin inline asm st.shared.b32 [%r2231], %r2229; // end inline asm add.s32 %r2233, %r2231, 4096; // begin inline asm st.shared.b32 [%r2233], %r2230; // end inline asm xor.b32 %r2237, %r2201, 96; mov.b32 %f1180, %r2896; mov.b32 %f1181, %r2897; // begin inline asm cvt.rn.f16x2.f32 %r2235, %f1180, %f1181; // end inline asm mov.b32 %f1182, %r2894; mov.b32 %f1183, %r2895; // begin inline asm cvt.rn.f16x2.f32 %r2236, %f1182, %f1183; // end inline asm // begin inline asm st.shared.b32 [%r2237], %r2235; // end inline asm add.s32 %r2239, %r2237, 4096; // begin inline asm st.shared.b32 [%r2239], %r2236; // end inline asm xor.b32 %r2243, %r2201, 112; mov.b32 %f1184, %r2892; mov.b32 %f1185, %r2893; // begin inline asm cvt.rn.f16x2.f32 %r2241, %f1184, %f1185; // end inline asm mov.b32 %f1186, %r2890; mov.b32 %f1187, %r2891; // begin inline asm cvt.rn.f16x2.f32 %r2242, %f1186, %f1187; // end inline asm // begin inline asm st.shared.b32 [%r2243], %r2241; // end inline asm add.s32 %r2245, %r2243, 4096; // begin inline asm st.shared.b32 [%r2245], %r2242; // end inline asm xor.b32 %r2249, %r2201, 128; mov.b32 %f1188, %r2888; mov.b32 %f1189, %r2889; // begin inline asm cvt.rn.f16x2.f32 %r2247, %f1188, %f1189; // end inline asm mov.b32 %f1190, %r2886; mov.b32 %f1191, %r2887; // begin inline asm cvt.rn.f16x2.f32 %r2248, %f1190, %f1191; // end inline asm // begin inline asm st.shared.b32 [%r2249], %r2247; // end inline asm add.s32 %r2251, %r2249, 4096; // begin inline asm st.shared.b32 [%r2251], %r2248; // end inline asm xor.b32 %r2255, %r2201, 144; mov.b32 %f1192, %r2884; mov.b32 %f1193, %r2885; // begin inline asm cvt.rn.f16x2.f32 %r2253, %f1192, %f1193; // end inline asm mov.b32 %f1194, %r2882; mov.b32 %f1195, %r2883; // begin inline asm cvt.rn.f16x2.f32 %r2254, %f1194, %f1195; // end inline asm // begin inline asm st.shared.b32 [%r2255], %r2253; // end inline asm add.s32 %r2257, %r2255, 4096; // begin inline asm st.shared.b32 [%r2257], %r2254; // end inline asm xor.b32 %r2261, %r2201, 160; mov.b32 %f1196, %r2880; mov.b32 %f1197, %r2881; // begin inline asm cvt.rn.f16x2.f32 %r2259, %f1196, %f1197; // end inline asm mov.b32 %f1198, %r2878; mov.b32 %f1199, %r2879; // begin inline asm cvt.rn.f16x2.f32 %r2260, %f1198, %f1199; // end inline asm // begin inline asm st.shared.b32 [%r2261], %r2259; // end inline asm add.s32 %r2263, %r2261, 4096; // begin inline asm st.shared.b32 [%r2263], %r2260; // end inline asm xor.b32 %r2267, %r2201, 176; mov.b32 %f1200, %r2876; mov.b32 %f1201, %r2877; // begin inline asm cvt.rn.f16x2.f32 %r2265, %f1200, %f1201; // end inline asm mov.b32 %f1202, %r2874; mov.b32 %f1203, %r2875; // begin inline asm cvt.rn.f16x2.f32 %r2266, %f1202, %f1203; // end inline asm // begin inline asm st.shared.b32 [%r2267], %r2265; // end inline asm add.s32 %r2269, %r2267, 4096; // begin 
inline asm st.shared.b32 [%r2269], %r2266; // end inline asm xor.b32 %r2273, %r2201, 192; mov.b32 %f1204, %r2872; mov.b32 %f1205, %r2873; // begin inline asm cvt.rn.f16x2.f32 %r2271, %f1204, %f1205; // end inline asm mov.b32 %f1206, %r2870; mov.b32 %f1207, %r2871; // begin inline asm cvt.rn.f16x2.f32 %r2272, %f1206, %f1207; // end inline asm // begin inline asm st.shared.b32 [%r2273], %r2271; // end inline asm add.s32 %r2275, %r2273, 4096; // begin inline asm st.shared.b32 [%r2275], %r2272; // end inline asm xor.b32 %r2279, %r2201, 208; mov.b32 %f1208, %r2868; mov.b32 %f1209, %r2869; // begin inline asm cvt.rn.f16x2.f32 %r2277, %f1208, %f1209; // end inline asm mov.b32 %f1210, %r2866; mov.b32 %f1211, %r2867; // begin inline asm cvt.rn.f16x2.f32 %r2278, %f1210, %f1211; // end inline asm // begin inline asm st.shared.b32 [%r2279], %r2277; // end inline asm add.s32 %r2281, %r2279, 4096; // begin inline asm st.shared.b32 [%r2281], %r2278; // end inline asm xor.b32 %r2285, %r2201, 224; mov.b32 %f1212, %r2864; mov.b32 %f1213, %r2865; // begin inline asm cvt.rn.f16x2.f32 %r2283, %f1212, %f1213; // end inline asm mov.b32 %f1214, %r2862; mov.b32 %f1215, %r2863; // begin inline asm cvt.rn.f16x2.f32 %r2284, %f1214, %f1215; // end inline asm // begin inline asm st.shared.b32 [%r2285], %r2283; // end inline asm add.s32 %r2287, %r2285, 4096; // begin inline asm st.shared.b32 [%r2287], %r2284; // end inline asm xor.b32 %r2291, %r2201, 240; mov.b32 %f1216, %r2860; mov.b32 %f1217, %r2861; // begin inline asm cvt.rn.f16x2.f32 %r2289, %f1216, %f1217; // end inline asm mov.b32 %f1218, %r2858; mov.b32 %f1219, %r2859; // begin inline asm cvt.rn.f16x2.f32 %r2290, %f1218, %f1219; // end inline asm // begin inline asm st.shared.b32 [%r2291], %r2289; // end inline asm add.s32 %r2293, %r2291, 4096; // begin inline asm st.shared.b32 [%r2293], %r2290; // end inline asm xor.b32 %r2297, %r2201, 256; mov.b32 %f1220, %r2856; mov.b32 %f1221, %r2857; // begin inline asm cvt.rn.f16x2.f32 %r2295, %f1220, %f1221; // end inline asm mov.b32 %f1222, %r2854; mov.b32 %f1223, %r2855; // begin inline asm cvt.rn.f16x2.f32 %r2296, %f1222, %f1223; // end inline asm // begin inline asm st.shared.b32 [%r2297], %r2295; // end inline asm add.s32 %r2299, %r2297, 4096; // begin inline asm st.shared.b32 [%r2299], %r2296; // end inline asm xor.b32 %r2303, %r2201, 272; mov.b32 %f1224, %r2852; mov.b32 %f1225, %r2853; // begin inline asm cvt.rn.f16x2.f32 %r2301, %f1224, %f1225; // end inline asm mov.b32 %f1226, %r2850; mov.b32 %f1227, %r2851; // begin inline asm cvt.rn.f16x2.f32 %r2302, %f1226, %f1227; // end inline asm // begin inline asm st.shared.b32 [%r2303], %r2301; // end inline asm add.s32 %r2305, %r2303, 4096; // begin inline asm st.shared.b32 [%r2305], %r2302; // end inline asm xor.b32 %r2309, %r2201, 288; mov.b32 %f1228, %r2848; mov.b32 %f1229, %r2849; // begin inline asm cvt.rn.f16x2.f32 %r2307, %f1228, %f1229; // end inline asm mov.b32 %f1230, %r2846; mov.b32 %f1231, %r2847; // begin inline asm cvt.rn.f16x2.f32 %r2308, %f1230, %f1231; // end inline asm // begin inline asm st.shared.b32 [%r2309], %r2307; // end inline asm add.s32 %r2311, %r2309, 4096; // begin inline asm st.shared.b32 [%r2311], %r2308; // end inline asm xor.b32 %r2315, %r2201, 304; mov.b32 %f1232, %r2844; mov.b32 %f1233, %r2845; // begin inline asm cvt.rn.f16x2.f32 %r2313, %f1232, %f1233; // end inline asm mov.b32 %f1234, %r2842; mov.b32 %f1235, %r2843; // begin inline asm cvt.rn.f16x2.f32 %r2314, %f1234, %f1235; // end inline asm // begin inline asm st.shared.b32 
[%r2315], %r2313; // end inline asm add.s32 %r2317, %r2315, 4096; // begin inline asm st.shared.b32 [%r2317], %r2314; // end inline asm xor.b32 %r2321, %r2201, 320; mov.b32 %f1236, %r2840; mov.b32 %f1237, %r2841; // begin inline asm cvt.rn.f16x2.f32 %r2319, %f1236, %f1237; // end inline asm mov.b32 %f1238, %r2838; mov.b32 %f1239, %r2839; // begin inline asm cvt.rn.f16x2.f32 %r2320, %f1238, %f1239; // end inline asm // begin inline asm st.shared.b32 [%r2321], %r2319; // end inline asm add.s32 %r2323, %r2321, 4096; // begin inline asm st.shared.b32 [%r2323], %r2320; // end inline asm xor.b32 %r2327, %r2201, 336; mov.b32 %f1240, %r2836; mov.b32 %f1241, %r2837; // begin inline asm cvt.rn.f16x2.f32 %r2325, %f1240, %f1241; // end inline asm mov.b32 %f1242, %r2834; mov.b32 %f1243, %r2835; // begin inline asm cvt.rn.f16x2.f32 %r2326, %f1242, %f1243; // end inline asm // begin inline asm st.shared.b32 [%r2327], %r2325; // end inline asm add.s32 %r2329, %r2327, 4096; // begin inline asm st.shared.b32 [%r2329], %r2326; // end inline asm xor.b32 %r2333, %r2201, 352; mov.b32 %f1244, %r2832; mov.b32 %f1245, %r2833; // begin inline asm cvt.rn.f16x2.f32 %r2331, %f1244, %f1245; // end inline asm mov.b32 %f1246, %r2830; mov.b32 %f1247, %r2831; // begin inline asm cvt.rn.f16x2.f32 %r2332, %f1246, %f1247; // end inline asm // begin inline asm st.shared.b32 [%r2333], %r2331; // end inline asm add.s32 %r2335, %r2333, 4096; // begin inline asm st.shared.b32 [%r2335], %r2332; // end inline asm xor.b32 %r2339, %r2201, 368; mov.b32 %f1248, %r2828; mov.b32 %f1249, %r2829; // begin inline asm cvt.rn.f16x2.f32 %r2337, %f1248, %f1249; // end inline asm mov.b32 %f1250, %r2826; mov.b32 %f1251, %r2827; // begin inline asm cvt.rn.f16x2.f32 %r2338, %f1250, %f1251; // end inline asm // begin inline asm st.shared.b32 [%r2339], %r2337; // end inline asm add.s32 %r2341, %r2339, 4096; // begin inline asm st.shared.b32 [%r2341], %r2338; // end inline asm xor.b32 %r2345, %r2201, 384; mov.b32 %f1252, %r2824; mov.b32 %f1253, %r2825; // begin inline asm cvt.rn.f16x2.f32 %r2343, %f1252, %f1253; // end inline asm mov.b32 %f1254, %r2822; mov.b32 %f1255, %r2823; // begin inline asm cvt.rn.f16x2.f32 %r2344, %f1254, %f1255; // end inline asm // begin inline asm st.shared.b32 [%r2345], %r2343; // end inline asm add.s32 %r2347, %r2345, 4096; // begin inline asm st.shared.b32 [%r2347], %r2344; // end inline asm xor.b32 %r2351, %r2201, 400; mov.b32 %f1256, %r2820; mov.b32 %f1257, %r2821; // begin inline asm cvt.rn.f16x2.f32 %r2349, %f1256, %f1257; // end inline asm mov.b32 %f1258, %r2818; mov.b32 %f1259, %r2819; // begin inline asm cvt.rn.f16x2.f32 %r2350, %f1258, %f1259; // end inline asm // begin inline asm st.shared.b32 [%r2351], %r2349; // end inline asm add.s32 %r2353, %r2351, 4096; // begin inline asm st.shared.b32 [%r2353], %r2350; // end inline asm xor.b32 %r2357, %r2201, 416; mov.b32 %f1260, %r2816; mov.b32 %f1261, %r2817; // begin inline asm cvt.rn.f16x2.f32 %r2355, %f1260, %f1261; // end inline asm mov.b32 %f1262, %r2814; mov.b32 %f1263, %r2815; // begin inline asm cvt.rn.f16x2.f32 %r2356, %f1262, %f1263; // end inline asm // begin inline asm st.shared.b32 [%r2357], %r2355; // end inline asm add.s32 %r2359, %r2357, 4096; // begin inline asm st.shared.b32 [%r2359], %r2356; // end inline asm xor.b32 %r2363, %r2201, 432; mov.b32 %f1264, %r2812; mov.b32 %f1265, %r2813; // begin inline asm cvt.rn.f16x2.f32 %r2361, %f1264, %f1265; // end inline asm mov.b32 %f1266, %r2810; mov.b32 %f1267, %r2811; // begin inline asm cvt.rn.f16x2.f32 
%r2362, %f1266, %f1267; // end inline asm // begin inline asm st.shared.b32 [%r2363], %r2361; // end inline asm add.s32 %r2365, %r2363, 4096; // begin inline asm st.shared.b32 [%r2365], %r2362; // end inline asm xor.b32 %r2369, %r2201, 448; mov.b32 %f1268, %r2808; mov.b32 %f1269, %r2809; // begin inline asm cvt.rn.f16x2.f32 %r2367, %f1268, %f1269; // end inline asm mov.b32 %f1270, %r2806; mov.b32 %f1271, %r2807; // begin inline asm cvt.rn.f16x2.f32 %r2368, %f1270, %f1271; // end inline asm // begin inline asm st.shared.b32 [%r2369], %r2367; // end inline asm add.s32 %r2371, %r2369, 4096; // begin inline asm st.shared.b32 [%r2371], %r2368; // end inline asm xor.b32 %r2375, %r2201, 464; mov.b32 %f1272, %r2804; mov.b32 %f1273, %r2805; // begin inline asm cvt.rn.f16x2.f32 %r2373, %f1272, %f1273; // end inline asm mov.b32 %f1274, %r2802; mov.b32 %f1275, %r2803; // begin inline asm cvt.rn.f16x2.f32 %r2374, %f1274, %f1275; // end inline asm // begin inline asm st.shared.b32 [%r2375], %r2373; // end inline asm add.s32 %r2377, %r2375, 4096; // begin inline asm st.shared.b32 [%r2377], %r2374; // end inline asm xor.b32 %r2381, %r2201, 480; mov.b32 %f1276, %r2800; mov.b32 %f1277, %r2801; // begin inline asm cvt.rn.f16x2.f32 %r2379, %f1276, %f1277; // end inline asm mov.b32 %f1278, %r2798; mov.b32 %f1279, %r2799; // begin inline asm cvt.rn.f16x2.f32 %r2380, %f1278, %f1279; // end inline asm // begin inline asm st.shared.b32 [%r2381], %r2379; // end inline asm add.s32 %r2383, %r2381, 4096; // begin inline asm st.shared.b32 [%r2383], %r2380; // end inline asm xor.b32 %r2387, %r2201, 496; mov.b32 %f1280, %r2796; mov.b32 %f1281, %r2797; // begin inline asm cvt.rn.f16x2.f32 %r2385, %f1280, %f1281; // end inline asm mov.b32 %f1282, %r2794; mov.b32 %f1283, %r2795; // begin inline asm cvt.rn.f16x2.f32 %r2386, %f1282, %f1283; // end inline asm // begin inline asm st.shared.b32 [%r2387], %r2385; // end inline asm add.s32 %r2389, %r2387, 4096; // begin inline asm st.shared.b32 [%r2389], %r2386; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r2391, %r2392, %r2393, %r2394}, [%r2395]; // end inline asm xor.b32 %r2483, %r2395, 64; add.s32 %r2400, %r2483, 2048; // begin inline asm ld.shared.v4.b32 {%r2396, %r2397, %r2398, %r2399}, [%r2400]; // end inline asm add.s32 %r2405, %r2395, 4096; // begin inline asm ld.shared.v4.b32 {%r2401, %r2402, %r2403, %r2404}, [%r2405]; // end inline asm add.s32 %r2410, %r2483, 6144; // begin inline asm ld.shared.v4.b32 {%r2406, %r2407, %r2408, %r2409}, [%r2410]; // end inline asm add.s32 %r2415, %r2395, 8192; // begin inline asm ld.shared.v4.b32 {%r2411, %r2412, %r2413, %r2414}, [%r2415]; // end inline asm add.s32 %r2420, %r2483, 10240; // begin inline asm ld.shared.v4.b32 {%r2416, %r2417, %r2418, %r2419}, [%r2420]; // end inline asm add.s32 %r2425, %r2395, 12288; // begin inline asm ld.shared.v4.b32 {%r2421, %r2422, %r2423, %r2424}, [%r2425]; // end inline asm add.s32 %r2430, %r2483, 14336; // begin inline asm ld.shared.v4.b32 {%r2426, %r2427, %r2428, %r2429}, [%r2430]; // end inline asm add.s32 %r2435, %r2395, 16384; // begin inline asm ld.shared.v4.b32 {%r2431, %r2432, %r2433, %r2434}, [%r2435]; // end inline asm add.s32 %r2440, %r2483, 18432; // begin inline asm ld.shared.v4.b32 {%r2436, %r2437, %r2438, %r2439}, [%r2440]; // end inline asm add.s32 %r2445, %r2395, 20480; // begin inline asm ld.shared.v4.b32 {%r2441, %r2442, %r2443, %r2444}, [%r2445]; // end inline asm add.s32 %r2450, %r2483, 22528; // begin inline asm ld.shared.v4.b32 {%r2446, %r2447, %r2448, 
%r2449}, [%r2450]; // end inline asm add.s32 %r2455, %r2395, 24576; // begin inline asm ld.shared.v4.b32 {%r2451, %r2452, %r2453, %r2454}, [%r2455]; // end inline asm add.s32 %r2460, %r2483, 26624; // begin inline asm ld.shared.v4.b32 {%r2456, %r2457, %r2458, %r2459}, [%r2460]; // end inline asm add.s32 %r2465, %r2395, 28672; // begin inline asm ld.shared.v4.b32 {%r2461, %r2462, %r2463, %r2464}, [%r2465]; // end inline asm add.s32 %r2470, %r2483, 30720; // begin inline asm ld.shared.v4.b32 {%r2466, %r2467, %r2468, %r2469}, [%r2470]; // end inline asm mul.lo.s32 %r2488, %r2526, %r2528; shl.b32 %r2489, %r2488, 1; cvt.s64.s32 %rd130, %r2489; add.s64 %rd44, %rd130, %rd64; cvt.u32.u64 %r2491, %rd4; setp.ge.s32 %p95, %r2491, %r1; @%p95 bra $L__BB0_112; mov.b64 %rd214, fmha_v2_flash_attention_fp16_64_16_S_256_sm86_kernel_nl_param_0; mov.u64 %rd213, %rd214; ld.param.u32 %r2538, [%rd213+60]; shr.s32 %r2537, %r2538, 31; shr.u32 %r2536, %r2537, 29; add.s32 %r2535, %r2538, %r2536; shr.s32 %r2534, %r2535, 3; shr.s32 %r2533, %r878, 31; shr.u32 %r2532, %r2533, 27; add.s32 %r2531, %r878, %r2532; and.b32 %r2530, %r2531, -32; sub.s32 %r2529, %r878, %r2530; setp.ge.s32 %p96, %r2529, %r2534; @%p96 bra $L__BB0_68; mul.lo.s64 %rd132, %rd9, %rd4; add.s64 %rd133, %rd44, %rd132; cvta.to.global.u64 %rd134, %rd10; add.s64 %rd135, %rd134, %rd133; st.global.v4.u32 [%rd135], {%r2391, %r2392, %r2393, %r2394}; $L__BB0_68: add.s32 %r2493, %r2491, 4; setp.ge.s32 %p97, %r2493, %r1; @%p97 bra $L__BB0_112; @%p96 bra $L__BB0_71; add.s64 %rd136, %rd4, 4; mul.lo.s64 %rd137, %rd136, %rd9; add.s64 %rd138, %rd44, %rd137; cvta.to.global.u64 %rd139, %rd10; add.s64 %rd140, %rd139, %rd138; st.global.v4.u32 [%rd140], {%r2396, %r2397, %r2398, %r2399}; $L__BB0_71: add.s32 %r2495, %r2491, 8; setp.ge.s32 %p99, %r2495, %r1; @%p99 bra $L__BB0_112; @%p96 bra $L__BB0_74; add.s64 %rd141, %rd4, 8; mul.lo.s64 %rd142, %rd141, %rd9; add.s64 %rd143, %rd44, %rd142; cvta.to.global.u64 %rd144, %rd10; add.s64 %rd145, %rd144, %rd143; st.global.v4.u32 [%rd145], {%r2401, %r2402, %r2403, %r2404}; $L__BB0_74: add.s32 %r2497, %r2491, 12; setp.ge.s32 %p101, %r2497, %r1; @%p101 bra $L__BB0_112; @%p96 bra $L__BB0_77; add.s64 %rd146, %rd4, 12; mul.lo.s64 %rd147, %rd146, %rd9; add.s64 %rd148, %rd44, %rd147; cvta.to.global.u64 %rd149, %rd10; add.s64 %rd150, %rd149, %rd148; st.global.v4.u32 [%rd150], {%r2406, %r2407, %r2408, %r2409}; $L__BB0_77: add.s32 %r2499, %r2491, 16; setp.ge.s32 %p103, %r2499, %r1; @%p103 bra $L__BB0_112; @%p96 bra $L__BB0_80; add.s64 %rd151, %rd4, 16; mul.lo.s64 %rd152, %rd151, %rd9; add.s64 %rd153, %rd44, %rd152; cvta.to.global.u64 %rd154, %rd10; add.s64 %rd155, %rd154, %rd153; st.global.v4.u32 [%rd155], {%r2411, %r2412, %r2413, %r2414}; $L__BB0_80: add.s32 %r2501, %r2491, 20; setp.ge.s32 %p105, %r2501, %r1; @%p105 bra $L__BB0_112; @%p96 bra $L__BB0_83; add.s64 %rd156, %rd4, 20; mul.lo.s64 %rd157, %rd156, %rd9; add.s64 %rd158, %rd44, %rd157; cvta.to.global.u64 %rd159, %rd10; add.s64 %rd160, %rd159, %rd158; st.global.v4.u32 [%rd160], {%r2416, %r2417, %r2418, %r2419}; $L__BB0_83: add.s32 %r2503, %r2491, 24; setp.ge.s32 %p107, %r2503, %r1; @%p107 bra $L__BB0_112; @%p96 bra $L__BB0_86; add.s64 %rd161, %rd4, 24; mul.lo.s64 %rd162, %rd161, %rd9; add.s64 %rd163, %rd44, %rd162; cvta.to.global.u64 %rd164, %rd10; add.s64 %rd165, %rd164, %rd163; st.global.v4.u32 [%rd165], {%r2421, %r2422, %r2423, %r2424}; $L__BB0_86: add.s32 %r2505, %r2491, 28; setp.ge.s32 %p109, %r2505, %r1; @%p109 bra $L__BB0_112; @%p96 bra $L__BB0_89; add.s64 %rd166, %rd4, 28; 
mul.lo.s64 %rd167, %rd166, %rd9; add.s64 %rd168, %rd44, %rd167; cvta.to.global.u64 %rd169, %rd10; add.s64 %rd170, %rd169, %rd168; st.global.v4.u32 [%rd170], {%r2426, %r2427, %r2428, %r2429}; $L__BB0_89: add.s32 %r2507, %r2491, 32; setp.ge.s32 %p111, %r2507, %r1; @%p111 bra $L__BB0_112; @%p96 bra $L__BB0_92; add.s64 %rd171, %rd4, 32; mul.lo.s64 %rd172, %rd171, %rd9; add.s64 %rd173, %rd44, %rd172; cvta.to.global.u64 %rd174, %rd10; add.s64 %rd175, %rd174, %rd173; st.global.v4.u32 [%rd175], {%r2431, %r2432, %r2433, %r2434}; $L__BB0_92: add.s32 %r2509, %r2491, 36; setp.ge.s32 %p113, %r2509, %r1; @%p113 bra $L__BB0_112; @%p96 bra $L__BB0_95; add.s64 %rd176, %rd4, 36; mul.lo.s64 %rd177, %rd176, %rd9; add.s64 %rd178, %rd44, %rd177; cvta.to.global.u64 %rd179, %rd10; add.s64 %rd180, %rd179, %rd178; st.global.v4.u32 [%rd180], {%r2436, %r2437, %r2438, %r2439}; $L__BB0_95: add.s32 %r2511, %r2491, 40; setp.ge.s32 %p115, %r2511, %r1; @%p115 bra $L__BB0_112; @%p96 bra $L__BB0_98; add.s64 %rd181, %rd4, 40; mul.lo.s64 %rd182, %rd181, %rd9; add.s64 %rd183, %rd44, %rd182; cvta.to.global.u64 %rd184, %rd10; add.s64 %rd185, %rd184, %rd183; st.global.v4.u32 [%rd185], {%r2441, %r2442, %r2443, %r2444}; $L__BB0_98: add.s32 %r2513, %r2491, 44; setp.ge.s32 %p117, %r2513, %r1; @%p117 bra $L__BB0_112; @%p96 bra $L__BB0_101; add.s64 %rd186, %rd4, 44; mul.lo.s64 %rd187, %rd186, %rd9; add.s64 %rd188, %rd44, %rd187; cvta.to.global.u64 %rd189, %rd10; add.s64 %rd190, %rd189, %rd188; st.global.v4.u32 [%rd190], {%r2446, %r2447, %r2448, %r2449}; $L__BB0_101: add.s32 %r2515, %r2491, 48; setp.ge.s32 %p119, %r2515, %r1; @%p119 bra $L__BB0_112; @%p96 bra $L__BB0_104; add.s64 %rd191, %rd4, 48; mul.lo.s64 %rd192, %rd191, %rd9; add.s64 %rd193, %rd44, %rd192; cvta.to.global.u64 %rd194, %rd10; add.s64 %rd195, %rd194, %rd193; st.global.v4.u32 [%rd195], {%r2451, %r2452, %r2453, %r2454}; $L__BB0_104: add.s32 %r2517, %r2491, 52; setp.ge.s32 %p121, %r2517, %r1; @%p121 bra $L__BB0_112; @%p96 bra $L__BB0_107; add.s64 %rd196, %rd4, 52; mul.lo.s64 %rd197, %rd196, %rd9; add.s64 %rd198, %rd44, %rd197; cvta.to.global.u64 %rd199, %rd10; add.s64 %rd200, %rd199, %rd198; st.global.v4.u32 [%rd200], {%r2456, %r2457, %r2458, %r2459}; $L__BB0_107: add.s32 %r2519, %r2491, 56; setp.ge.s32 %p123, %r2519, %r1; @%p123 bra $L__BB0_112; @%p96 bra $L__BB0_110; add.s64 %rd201, %rd4, 56; mul.lo.s64 %rd202, %rd201, %rd9; add.s64 %rd203, %rd44, %rd202; cvta.to.global.u64 %rd204, %rd10; add.s64 %rd205, %rd204, %rd203; st.global.v4.u32 [%rd205], {%r2461, %r2462, %r2463, %r2464}; $L__BB0_110: add.s32 %r2521, %r2491, 60; setp.ge.s32 %p125, %r2521, %r1; or.pred %p127, %p125, %p96; @%p127 bra $L__BB0_112; add.s64 %rd206, %rd4, 60; mul.lo.s64 %rd207, %rd206, %rd9; add.s64 %rd208, %rd44, %rd207; cvta.to.global.u64 %rd209, %rd10; add.s64 %rd210, %rd209, %rd208; st.global.v4.u32 [%rd210], {%r2466, %r2467, %r2468, %r2469}; $L__BB0_112: ret; }