// Compiler Build ID: CL-31678015
// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//

.version 7.8
.target sm_60
.address_size 64

    // .globl MVNKernel_fp16_large
// _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp has been demoted
// _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu has been demoted
// _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma has been demoted

.visible .entry MVNKernel_fp16_large(
    .param .align 8 .b8 MVNKernel_fp16_large_param_0[88]
)
{
    .reg .pred %p<94>;
    .reg .b16 %rs<188>;
    .reg .f32 %f<165>;
    .reg .b32 %r<275>;
    .reg .b64 %rd<99>;
    // demoted variable
    .shared .align 4 .b8 _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp[128];
    // demoted variable
    .shared .align 4 .f32 _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu;
    // demoted variable
    .shared .align 4 .f32 _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma;

    ld.param.v2.u8 {%rs66, %rs67}, [MVNKernel_fp16_large_param_0+48];
    ld.param.v2.u32 {%r125, %r126}, [MVNKernel_fp16_large_param_0+56];
    ld.param.v2.u32 {%r127, %r128}, [MVNKernel_fp16_large_param_0+64];
    ld.param.v2.u32 {%r129, %r130}, [MVNKernel_fp16_large_param_0+72];
    ld.param.v4.u8 {%rs68, %rs69, %rs70, %rs71}, [MVNKernel_fp16_large_param_0+84];
    ld.param.v4.u8 {%rs72, %rs73, %rs74, %rs75}, [MVNKernel_fp16_large_param_0+44];
    ld.param.u32 %r124, [MVNKernel_fp16_large_param_0+80];
    ld.param.u8 %rs61, [MVNKernel_fp16_large_param_0+50];
    ld.param.f32 %f47, [MVNKernel_fp16_large_param_0+40];
    ld.param.f32 %f46, [MVNKernel_fp16_large_param_0+36];
    ld.param.u32 %r116, [MVNKernel_fp16_large_param_0+32];
    ld.param.u64 %rd43, [MVNKernel_fp16_large_param_0+24];
    ld.param.u64 %rd42, [MVNKernel_fp16_large_param_0+16];
    ld.param.u64 %rd41, [MVNKernel_fp16_large_param_0+8];
    ld.param.u64 %rd40, [MVNKernel_fp16_large_param_0];
    cvta.to.global.u64 %rd1, %rd40;
    cvta.to.global.u64 %rd2, %rd41;
    cvta.to.global.u64 %rd3, %rd42;
    cvta.to.global.u64 %rd4, %rd43;
    mov.u32 %r1, %ctaid.x;
    mul.lo.s32 %r3, %r116, %r1;
    setp.eq.s16 %p1, %rs72, 0;
    mov.f32 %f153, 0f00000000;
    mov.u32 %r272, %tid.x;
    @%p1 bra $L__BB0_22;

    setp.ge.s32 %p2, %r272, %r116;
    mov.f32 %f147, 0f00000000;
    @%p2 bra $L__BB0_8;

    not.b32 %r131, %r272;
    add.s32 %r5, %r116, %r131;
    shr.u32 %r132, %r5, 10;
    add.s32 %r133, %r132, 1;
    and.b32 %r244, %r133, 3;
    setp.eq.s32 %p3, %r244, 0;
    mov.f32 %f147, 0f00000000;
    mov.u32 %r245, %r272;
    @%p3 bra $L__BB0_5;

    add.s32 %r134, %r272, %r3;
    mul.wide.s32 %rd44, %r134, 2;
    add.s64 %rd89, %rd1, %rd44;
    mov.u32 %r245, %r272;

$L__BB0_4:
    .pragma "nounroll";
    ld.global.u16 %rs76, [%rd89];
    // begin inline asm
    { cvt.f32.f16 %f53, %rs76;}
    // end inline asm
    add.ftz.f32 %f147, %f147, %f53;
    add.s32 %r245, %r245, 1024;
    add.s64 %rd89, %rd89, 2048;
    add.s32 %r244, %r244, -1;
    setp.ne.s32 %p4, %r244, 0;
    @%p4 bra $L__BB0_4;

$L__BB0_5:
    setp.lt.u32 %p5, %r5, 3072;
    @%p5 bra $L__BB0_8;

    add.s32 %r135, %r245, %r3;
    mul.wide.s32 %rd45, %r135, 2;
    add.s64 %rd46, %rd1, %rd45;
    add.s64 %rd90, %rd46, 4096;

$L__BB0_7:
    ld.global.u16 %rs77, [%rd90+-4096];
    // begin inline asm
    { cvt.f32.f16 %f54, %rs77;}
    // end inline asm
    add.ftz.f32 %f58, %f147, %f54;
    ld.global.u16 %rs78, [%rd90+-2048];
    // begin inline asm
    { cvt.f32.f16 %f55, %rs78;}
    // end inline asm
    add.ftz.f32 %f59, %f58, %f55;
    ld.global.u16 %rs79, [%rd90];
    // begin inline asm
    { cvt.f32.f16 %f56, %rs79;}
    // end inline asm
    add.ftz.f32 %f60, %f59, %f56;
    ld.global.u16 %rs80, [%rd90+2048];
    // begin inline asm
    { cvt.f32.f16 %f57, %rs80;}
    // end inline asm
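    // [annotation] Still inside $L__BB0_7: the fourth strided fp16 element has just been
    // converted; it is folded into the running sum below, the pointers advance by
    // 4 * 2048 bytes, and the loop repeats while %r245 < %r116. This 4x-unrolled loop
    // appears to build each thread's partial sum for the block mean.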
    add.ftz.f32 %f147, %f60, %f57;
    add.s64 %rd90, %rd90, 8192;
    add.s32 %r245, %r245, 4096;
    setp.lt.s32 %p6, %r245, %r116;
    @%p6 bra $L__BB0_7;

$L__BB0_8:
    mul.ftz.f32 %f152, %f46, %f147;
    mov.u32 %r248, WARP_SZ;
    setp.lt.s32 %p7, %r248, 2;
    @%p7 bra $L__BB0_11;

    mov.u32 %r247, %r248;

$L__BB0_10:
    mov.b32 %r136, %f152;
    shr.u32 %r137, %r247, 31;
    add.s32 %r138, %r247, %r137;
    shr.s32 %r16, %r138, 1;
    mov.u32 %r139, 31;
    mov.u32 %r140, -1;
    shfl.sync.down.b32 %r141|%p8, %r136, %r16, %r139, %r140;
    mov.b32 %f61, %r141;
    add.ftz.f32 %f152, %f152, %f61;
    setp.gt.s32 %p9, %r247, 3;
    mov.u32 %r247, %r16;
    @%p9 bra $L__BB0_10;

$L__BB0_11:
    rem.u32 %r17, %r272, %r248;
    setp.ne.s32 %p10, %r17, 0;
    @%p10 bra $L__BB0_13;

    div.u32 %r142, %r272, %r248;
    shl.b32 %r143, %r142, 2;
    mov.u32 %r144, _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r145, %r144, %r143;
    st.shared.f32 [%r145], %f152;

$L__BB0_13:
    bar.sync 0;
    setp.le.u32 %p11, %r248, %r272;
    @%p11 bra $L__BB0_19;

    mov.u32 %r146, %ntid.x;
    div.u32 %r147, %r146, %r248;
    setp.ge.s32 %p12, %r17, %r147;
    mov.f32 %f152, 0f00000000;
    @%p12 bra $L__BB0_16;

    shl.b32 %r148, %r17, 2;
    mov.u32 %r149, _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r150, %r149, %r148;
    ld.shared.f32 %f152, [%r150];

$L__BB0_16:
    @%p7 bra $L__BB0_19;

$L__BB0_18:
    mov.b32 %r151, %f152;
    shr.u32 %r152, %r248, 31;
    add.s32 %r153, %r248, %r152;
    shr.s32 %r19, %r153, 1;
    mov.u32 %r154, 31;
    mov.u32 %r155, -1;
    shfl.sync.down.b32 %r156|%p14, %r151, %r19, %r154, %r155;
    mov.b32 %f63, %r156;
    add.ftz.f32 %f152, %f152, %f63;
    setp.gt.s32 %p15, %r248, 3;
    mov.u32 %r248, %r19;
    @%p15 bra $L__BB0_18;

$L__BB0_19:
    bar.sync 0;
    setp.ne.s32 %p16, %r272, 0;
    @%p16 bra $L__BB0_21;

    st.shared.f32 [_ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu], %f152;

$L__BB0_21:
    bar.sync 0;
    ld.shared.f32 %f153, [_ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu];

$L__BB0_22:
    setp.eq.s16 %p17, %rs73, 0;
    mov.f32 %f164, 0f3F800000;
    @%p17 bra $L__BB0_44;

    setp.ge.s32 %p18, %r272, %r116;
    mov.f32 %f158, 0f00000000;
    @%p18 bra $L__BB0_30;

    not.b32 %r157, %r272;
    add.s32 %r20, %r116, %r157;
    shr.u32 %r158, %r20, 10;
    add.s32 %r159, %r158, 1;
    and.b32 %r250, %r159, 3;
    setp.eq.s32 %p19, %r250, 0;
    mov.f32 %f158, 0f00000000;
    mov.u32 %r251, %r272;
    @%p19 bra $L__BB0_27;

    add.s32 %r160, %r272, %r3;
    mul.wide.s32 %rd47, %r160, 2;
    add.s64 %rd91, %rd1, %rd47;
    mov.u32 %r251, %r272;

$L__BB0_26:
    .pragma "nounroll";
    ld.global.u16 %rs82, [%rd91];
    // begin inline asm
    { cvt.f32.f16 %f69, %rs82;}
    // end inline asm
    sub.ftz.f32 %f70, %f69, %f153;
    fma.rn.ftz.f32 %f158, %f70, %f70, %f158;
    add.s32 %r251, %r251, 1024;
    add.s64 %rd91, %rd91, 2048;
    add.s32 %r250, %r250, -1;
    setp.ne.s32 %p20, %r250, 0;
    @%p20 bra $L__BB0_26;

$L__BB0_27:
    setp.lt.u32 %p21, %r20, 3072;
    @%p21 bra $L__BB0_30;

    add.s32 %r161, %r251, %r3;
    mul.wide.s32 %rd48, %r161, 2;
    add.s64 %rd49, %rd1, %rd48;
    add.s64 %rd92, %rd49, 4096;

$L__BB0_29:
    ld.global.u16 %rs83, [%rd92+-4096];
    // begin inline asm
    { cvt.f32.f16 %f71, %rs83;}
    // end inline asm
    sub.ftz.f32 %f75, %f71, %f153;
    fma.rn.ftz.f32 %f76, %f75, %f75, %f158;
    ld.global.u16 %rs84, [%rd92+-2048];
    // begin inline asm
    { cvt.f32.f16 %f72, %rs84;}
    // end inline asm
    sub.ftz.f32 %f77, %f72, %f153;
    fma.rn.ftz.f32 %f78, %f77, %f77, %f76;
    ld.global.u16 %rs85, [%rd92];
    // begin inline asm
    { cvt.f32.f16 %f73, %rs85;}
    // end inline asm
    sub.ftz.f32 %f79, %f73, %f153;
    fma.rn.ftz.f32 %f80, %f79, %f79, %f78;
    ld.global.u16 %rs86, [%rd92+2048];
    // begin inline asm
    { cvt.f32.f16 %f74, %rs86;}
    // end inline asm
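    // [annotation] Still inside $L__BB0_29: below, (x - mu) for the fourth element is
    // squared and accumulated with fma. This 4x-unrolled loop appears to build each
    // thread's partial sum of squared deviations for the variance, with %f153 holding
    // the block mean computed above.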
    sub.ftz.f32 %f81, %f74, %f153;
    fma.rn.ftz.f32 %f158, %f81, %f81, %f80;
    add.s64 %rd92, %rd92, 8192;
    add.s32 %r251, %r251, 4096;
    setp.lt.s32 %p22, %r251, %r116;
    @%p22 bra $L__BB0_29;

$L__BB0_30:
    mul.ftz.f32 %f163, %f46, %f158;
    mov.u32 %r254, WARP_SZ;
    setp.lt.s32 %p23, %r254, 2;
    @%p23 bra $L__BB0_33;

    mov.u32 %r253, %r254;

$L__BB0_32:
    mov.b32 %r162, %f163;
    shr.u32 %r163, %r253, 31;
    add.s32 %r164, %r253, %r163;
    shr.s32 %r31, %r164, 1;
    mov.u32 %r165, 31;
    mov.u32 %r166, -1;
    shfl.sync.down.b32 %r167|%p24, %r162, %r31, %r165, %r166;
    mov.b32 %f82, %r167;
    add.ftz.f32 %f163, %f163, %f82;
    setp.gt.s32 %p25, %r253, 3;
    mov.u32 %r253, %r31;
    @%p25 bra $L__BB0_32;

$L__BB0_33:
    rem.u32 %r32, %r272, %r254;
    setp.ne.s32 %p26, %r32, 0;
    @%p26 bra $L__BB0_35;

    div.u32 %r168, %r272, %r254;
    shl.b32 %r169, %r168, 2;
    mov.u32 %r170, _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r171, %r170, %r169;
    st.shared.f32 [%r171], %f163;

$L__BB0_35:
    bar.sync 0;
    setp.le.u32 %p27, %r254, %r272;
    @%p27 bra $L__BB0_41;

    mov.u32 %r172, %ntid.x;
    div.u32 %r173, %r172, %r254;
    setp.ge.s32 %p28, %r32, %r173;
    mov.f32 %f163, 0f00000000;
    @%p28 bra $L__BB0_38;

    shl.b32 %r174, %r32, 2;
    mov.u32 %r175, _ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r176, %r175, %r174;
    ld.shared.f32 %f163, [%r176];

$L__BB0_38:
    @%p23 bra $L__BB0_41;

$L__BB0_40:
    mov.b32 %r177, %f163;
    shr.u32 %r178, %r254, 31;
    add.s32 %r179, %r254, %r178;
    shr.s32 %r34, %r179, 1;
    mov.u32 %r180, 31;
    mov.u32 %r181, -1;
    shfl.sync.down.b32 %r182|%p30, %r177, %r34, %r180, %r181;
    mov.b32 %f84, %r182;
    add.ftz.f32 %f163, %f163, %f84;
    setp.gt.s32 %p31, %r254, 3;
    mov.u32 %r254, %r34;
    @%p31 bra $L__BB0_40;

$L__BB0_41:
    bar.sync 0;
    setp.ne.s32 %p32, %r272, 0;
    @%p32 bra $L__BB0_43;

    add.ftz.f32 %f85, %f47, %f163;
    rsqrt.approx.ftz.f32 %f86, %f85;
    st.shared.f32 [_ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f86;

$L__BB0_43:
    bar.sync 0;
    ld.shared.f32 %f164, [_ZZ20LayerNormLargeKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma];

$L__BB0_44:
    setp.ge.s32 %p33, %r272, %r116;
    @%p33 bra $L__BB0_120;

    setp.eq.s16 %p34, %rs61, 0;
    setp.ne.s16 %p35, %rs67, 0;
    selp.b32 %r37, %r1, 0, %p35;
    mul.lo.s32 %r38, %r127, %r125;
    mul.lo.s32 %r39, %r126, %r127;
    not.b32 %r183, %r272;
    add.s32 %r40, %r116, %r183;
    @%p34 bra $L__BB0_84;

    and.b32 %r184, %r40, 1024;
    setp.ne.s32 %p36, %r184, 0;
    @%p36 bra $L__BB0_59;

    setp.eq.s16 %p37, %rs69, 0;
    @%p37 bra $L__BB0_51;

    setp.eq.s16 %p38, %rs68, 0;
    @%p38 bra $L__BB0_50;

    div.u32 %r256, %r272, %r39;
    mul.lo.s32 %r185, %r256, %r39;
    sub.s32 %r186, %r272, %r185;
    div.u32 %r257, %r186, %r127;
    mul.lo.s32 %r187, %r257, %r127;
    sub.s32 %r258, %r186, %r187;
    mov.u32 %r255, %r37;
    bra.uni $L__BB0_52;

$L__BB0_84:
    shr.u32 %r227, %r40, 10;
    add.s32 %r228, %r227, 1;
    and.b32 %r271, %r228, 3;
    setp.eq.s32 %p60, %r271, 0;
    @%p60 bra $L__BB0_93;

    add.s32 %r269, %r272, %r3;
    mul.wide.s32 %rd67, %r269, 2;
    add.s64 %rd96, %rd4, %rd67;
    add.s64 %rd95, %rd1, %rd67;

$L__BB0_86:
    .pragma "nounroll";
    selp.b32 %r99, %r269, %r272, %p35;
    setp.eq.s16 %p62, %rs75, 0;
    @%p62 bra $L__BB0_88;

    mul.wide.u32 %rd68, %r99, 2;
    add.s64 %rd69, %rd2, %rd68;
    ld.global.u16 %rs178, [%rd69];
    bra.uni $L__BB0_89;

$L__BB0_88:
    mov.u32 %r229, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs178, %r229;
    // end inline asm

$L__BB0_89:
    // begin inline asm
    { cvt.f32.f16 %f108, %rs178;}
    // end inline asm
    setp.eq.s16 %p63, %rs74, 0;
    @%p63 bra $L__BB0_91;

    mul.wide.u32 %rd70, %r99, 2;
    add.s64 %rd71, %rd3, %rd70;
    ld.global.u16 %rs179, [%rd71];
    bra.uni $L__BB0_92;

$L__BB0_91:
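    // [annotation] $L__BB0_91 is the fall-through when the bias flag (%rs74) is clear:
    // a constant 0 is converted to fp16 below and used as the default bias, mirroring
    // the default scale of 1 produced at $L__BB0_88 when %rs75 is clear.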
    mov.u32 %r230, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs179, %r230;
    // end inline asm

$L__BB0_92:
    // begin inline asm
    { cvt.f32.f16 %f109, %rs179;}
    // end inline asm
    ld.global.u16 %rs129, [%rd95];
    // begin inline asm
    { cvt.f32.f16 %f110, %rs129;}
    // end inline asm
    sub.ftz.f32 %f112, %f110, %f153;
    mul.ftz.f32 %f113, %f164, %f112;
    fma.rn.ftz.f32 %f114, %f108, %f113, %f109;
    setp.lt.ftz.f32 %p64, %f114, 0f00000000;
    setp.ne.s16 %p65, %rs66, 0;
    and.pred %p66, %p65, %p64;
    selp.f32 %f111, 0f00000000, %f114, %p66;
    // begin inline asm
    { cvt.rn.f16.f32 %rs130, %f111;}
    // end inline asm
    st.global.u16 [%rd96], %rs130;
    add.s32 %r272, %r272, 1024;
    add.s64 %rd96, %rd96, 2048;
    add.s64 %rd95, %rd95, 2048;
    add.s32 %r269, %r269, 1024;
    add.s32 %r271, %r271, -1;
    setp.ne.s32 %p67, %r271, 0;
    @%p67 bra $L__BB0_86;

$L__BB0_93:
    setp.lt.u32 %p68, %r40, 3072;
    @%p68 bra $L__BB0_120;

    add.s32 %r231, %r272, %r3;
    add.s32 %r273, %r231, 3072;
    mul.wide.s32 %rd72, %r231, 2;
    add.s64 %rd97, %rd1, %rd72;
    add.s64 %rd98, %rd4, %rd72;

$L__BB0_95:
    add.s32 %r232, %r3, %r272;
    selp.b32 %r107, %r232, %r272, %p35;
    setp.eq.s16 %p70, %rs75, 0;
    @%p70 bra $L__BB0_97;

    mul.wide.u32 %rd73, %r107, 2;
    add.s64 %rd74, %rd2, %rd73;
    ld.global.u16 %rs180, [%rd74];
    bra.uni $L__BB0_98;

$L__BB0_97:
    mov.u32 %r233, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs180, %r233;
    // end inline asm

$L__BB0_98:
    // begin inline asm
    { cvt.f32.f16 %f115, %rs180;}
    // end inline asm
    setp.eq.s16 %p71, %rs74, 0;
    @%p71 bra $L__BB0_100;

    mul.wide.u32 %rd75, %r107, 2;
    add.s64 %rd76, %rd3, %rd75;
    ld.global.u16 %rs181, [%rd76];
    bra.uni $L__BB0_101;

$L__BB0_100:
    mov.u32 %r234, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs181, %r234;
    // end inline asm

$L__BB0_101:
    // begin inline asm
    { cvt.f32.f16 %f116, %rs181;}
    // end inline asm
    ld.global.u16 %rs139, [%rd97];
    // begin inline asm
    { cvt.f32.f16 %f117, %rs139;}
    // end inline asm
    sub.ftz.f32 %f119, %f117, %f153;
    mul.ftz.f32 %f120, %f164, %f119;
    fma.rn.ftz.f32 %f121, %f115, %f120, %f116;
    setp.lt.ftz.f32 %p72, %f121, 0f00000000;
    setp.ne.s16 %p73, %rs66, 0;
    and.pred %p74, %p73, %p72;
    selp.f32 %f118, 0f00000000, %f121, %p74;
    // begin inline asm
    { cvt.rn.f16.f32 %rs140, %f118;}
    // end inline asm
    st.global.u16 [%rd98], %rs140;
    add.s32 %r108, %r272, 1024;
    add.s32 %r235, %r273, -2048;
    selp.b32 %r109, %r235, %r108, %p35;
    @%p70 bra $L__BB0_103;

    mul.wide.u32 %rd77, %r109, 2;
    add.s64 %rd78, %rd2, %rd77;
    ld.global.u16 %rs182, [%rd78];
    bra.uni $L__BB0_104;

$L__BB0_103:
    mov.u32 %r236, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs182, %r236;
    // end inline asm

$L__BB0_104:
    // begin inline asm
    { cvt.f32.f16 %f122, %rs182;}
    // end inline asm
    @%p71 bra $L__BB0_106;

    mul.wide.u32 %rd79, %r109, 2;
    add.s64 %rd80, %rd3, %rd79;
    ld.global.u16 %rs183, [%rd80];
    bra.uni $L__BB0_107;

$L__BB0_106:
    mov.u32 %r237, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs183, %r237;
    // end inline asm

$L__BB0_107:
    // begin inline asm
    { cvt.f32.f16 %f123, %rs183;}
    // end inline asm
    ld.global.u16 %rs149, [%rd97+2048];
    // begin inline asm
    { cvt.f32.f16 %f124, %rs149;}
    // end inline asm
    sub.ftz.f32 %f126, %f124, %f153;
    mul.ftz.f32 %f127, %f164, %f126;
    fma.rn.ftz.f32 %f128, %f122, %f127, %f123;
    setp.lt.ftz.f32 %p78, %f128, 0f00000000;
    and.pred %p80, %p73, %p78;
    selp.f32 %f125, 0f00000000, %f128, %p80;
    // begin inline asm
    { cvt.rn.f16.f32 %rs150, %f125;}
    // end inline asm
    st.global.u16 [%rd98+2048], %rs150;
    add.s32 %r110, %r108, 1024;
    add.s32 %r238, %r273, -1024;
    selp.b32 %r111, %r238, %r110, %p35;
    @%p70 bra $L__BB0_109;

    mul.wide.u32 %rd81, %r111, 2;
    add.s64 %rd82, %rd2, %rd81;
    ld.global.u16 %rs184, [%rd82];
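    // [annotation] Third element of the 4x-unrolled normalize/store loop at $L__BB0_95:
    // the per-element scale value was just loaded from %rd2; when the scale flag is
    // clear, $L__BB0_109 below substitutes the constant 1 instead.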
    bra.uni $L__BB0_110;

$L__BB0_109:
    mov.u32 %r239, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs184, %r239;
    // end inline asm

$L__BB0_110:
    // begin inline asm
    { cvt.f32.f16 %f129, %rs184;}
    // end inline asm
    @%p71 bra $L__BB0_112;

    mul.wide.u32 %rd83, %r111, 2;
    add.s64 %rd84, %rd3, %rd83;
    ld.global.u16 %rs185, [%rd84];
    bra.uni $L__BB0_113;

$L__BB0_112:
    mov.u32 %r240, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs185, %r240;
    // end inline asm

$L__BB0_113:
    // begin inline asm
    { cvt.f32.f16 %f130, %rs185;}
    // end inline asm
    ld.global.u16 %rs159, [%rd97+4096];
    // begin inline asm
    { cvt.f32.f16 %f131, %rs159;}
    // end inline asm
    sub.ftz.f32 %f133, %f131, %f153;
    mul.ftz.f32 %f134, %f164, %f133;
    fma.rn.ftz.f32 %f135, %f129, %f134, %f130;
    setp.lt.ftz.f32 %p84, %f135, 0f00000000;
    and.pred %p86, %p73, %p84;
    selp.f32 %f132, 0f00000000, %f135, %p86;
    // begin inline asm
    { cvt.rn.f16.f32 %rs160, %f132;}
    // end inline asm
    st.global.u16 [%rd98+4096], %rs160;
    add.s32 %r112, %r110, 1024;
    selp.b32 %r113, %r273, %r112, %p35;
    @%p70 bra $L__BB0_115;

    mul.wide.u32 %rd85, %r113, 2;
    add.s64 %rd86, %rd2, %rd85;
    ld.global.u16 %rs186, [%rd86];
    bra.uni $L__BB0_116;

$L__BB0_115:
    mov.u32 %r241, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs186, %r241;
    // end inline asm

$L__BB0_116:
    // begin inline asm
    { cvt.f32.f16 %f136, %rs186;}
    // end inline asm
    @%p71 bra $L__BB0_118;

    mul.wide.u32 %rd87, %r113, 2;
    add.s64 %rd88, %rd3, %rd87;
    ld.global.u16 %rs187, [%rd88];
    bra.uni $L__BB0_119;

$L__BB0_118:
    mov.u32 %r242, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs187, %r242;
    // end inline asm

$L__BB0_119:
    // begin inline asm
    { cvt.f32.f16 %f137, %rs187;}
    // end inline asm
    add.s64 %rd38, %rd97, 8192;
    ld.global.u16 %rs169, [%rd97+6144];
    // begin inline asm
    { cvt.f32.f16 %f138, %rs169;}
    // end inline asm
    sub.ftz.f32 %f140, %f138, %f153;
    mul.ftz.f32 %f141, %f164, %f140;
    fma.rn.ftz.f32 %f142, %f136, %f141, %f137;
    setp.lt.ftz.f32 %p90, %f142, 0f00000000;
    and.pred %p92, %p73, %p90;
    selp.f32 %f139, 0f00000000, %f142, %p92;
    // begin inline asm
    { cvt.rn.f16.f32 %rs170, %f139;}
    // end inline asm
    add.s64 %rd39, %rd98, 8192;
    st.global.u16 [%rd98+6144], %rs170;
    add.s32 %r273, %r273, 4096;
    add.s32 %r272, %r112, 1024;
    setp.lt.s32 %p93, %r272, %r116;
    mov.u64 %rd97, %rd38;
    mov.u64 %rd98, %rd39;
    @%p93 bra $L__BB0_95;

    bra.uni $L__BB0_120;

$L__BB0_51:
    div.u32 %r255, %r37, %r125;
    mul.lo.s32 %r191, %r255, %r125;
    sub.s32 %r256, %r37, %r191;
    div.u32 %r257, %r272, %r127;
    mul.lo.s32 %r192, %r257, %r127;
    sub.s32 %r258, %r272, %r192;
    bra.uni $L__BB0_52;

$L__BB0_50:
    div.u32 %r257, %r272, %r38;
    mul.lo.s32 %r188, %r257, %r38;
    sub.s32 %r189, %r272, %r188;
    div.u32 %r258, %r189, %r125;
    mul.lo.s32 %r190, %r258, %r125;
    sub.s32 %r256, %r189, %r190;
    mov.u32 %r255, %r37;

$L__BB0_52:
    mul.lo.s32 %r193, %r255, %r128;
    mad.lo.s32 %r194, %r256, %r129, %r193;
    mad.lo.s32 %r195, %r257, %r130, %r194;
    mad.lo.s32 %r196, %r258, %r124, %r195;
    cvt.u64.u32 %rd17, %r196;
    setp.eq.s16 %p39, %rs75, 0;
    @%p39 bra $L__BB0_54;

    shl.b64 %rd50, %rd17, 1;
    add.s64 %rd51, %rd2, %rd50;
    ld.global.u16 %rs172, [%rd51];
    bra.uni $L__BB0_55;

$L__BB0_54:
    mov.u32 %r197, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs172, %r197;
    // end inline asm

$L__BB0_55:
    // begin inline asm
    { cvt.f32.f16 %f87, %rs172;}
    // end inline asm
    setp.eq.s16 %p40, %rs74, 0;
    @%p40 bra $L__BB0_57;

    shl.b64 %rd52, %rd17, 1;
    add.s64 %rd53, %rd3, %rd52;
    ld.global.u16 %rs173, [%rd53];
    bra.uni $L__BB0_58;

$L__BB0_57:
    mov.u32 %r198, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs173, %r198;
    // end inline asm

$L__BB0_58:
    // begin inline asm
    { cvt.f32.f16 %f88, %rs173;}
    // end inline asm
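    // [annotation] Peeled first iteration of the strided scale/bias path (%rs61 != 0):
    // the scale/bias offset was decomposed into coordinates at $L__BB0_50/51/52 and
    // combined with the per-dimension strides; below, the input element at
    // %tid.x + %ctaid.x * %r116 is normalized and stored.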
    add.s32 %r199, %r272, %r3;
    mul.wide.s32 %rd54, %r199, 2;
    add.s64 %rd55, %rd1, %rd54;
    ld.global.u16 %rs97, [%rd55];
    // begin inline asm
    { cvt.f32.f16 %f89, %rs97;}
    // end inline asm
    sub.ftz.f32 %f91, %f89, %f153;
    mul.ftz.f32 %f92, %f164, %f91;
    fma.rn.ftz.f32 %f93, %f87, %f92, %f88;
    setp.lt.ftz.f32 %p41, %f93, 0f00000000;
    setp.ne.s16 %p42, %rs66, 0;
    and.pred %p43, %p42, %p41;
    selp.f32 %f90, 0f00000000, %f93, %p43;
    // begin inline asm
    { cvt.rn.f16.f32 %rs98, %f90;}
    // end inline asm
    add.s64 %rd56, %rd4, %rd54;
    st.global.u16 [%rd56], %rs98;
    add.s32 %r272, %r272, 1024;

$L__BB0_59:
    setp.lt.u32 %p44, %r40, 1024;
    @%p44 bra $L__BB0_120;

    add.s32 %r200, %r272, %r3;
    mul.wide.s32 %rd57, %r200, 2;
    add.s64 %rd58, %rd57, 2048;
    add.s64 %rd94, %rd4, %rd58;
    add.s64 %rd93, %rd1, %rd58;

$L__BB0_61:
    setp.eq.s16 %p45, %rs69, 0;
    @%p45 bra $L__BB0_65;

    setp.eq.s16 %p46, %rs68, 0;
    @%p46 bra $L__BB0_64;

    div.u32 %r262, %r272, %r39;
    mul.lo.s32 %r201, %r262, %r39;
    sub.s32 %r202, %r272, %r201;
    div.u32 %r263, %r202, %r127;
    mul.lo.s32 %r203, %r263, %r127;
    sub.s32 %r264, %r202, %r203;
    mov.u32 %r261, %r37;
    bra.uni $L__BB0_66;

$L__BB0_65:
    div.u32 %r261, %r37, %r125;
    mul.lo.s32 %r207, %r261, %r125;
    sub.s32 %r262, %r37, %r207;
    div.u32 %r263, %r272, %r127;
    mul.lo.s32 %r208, %r263, %r127;
    sub.s32 %r264, %r272, %r208;
    bra.uni $L__BB0_66;

$L__BB0_64:
    div.u32 %r263, %r272, %r38;
    mul.lo.s32 %r204, %r263, %r38;
    sub.s32 %r205, %r272, %r204;
    div.u32 %r264, %r205, %r125;
    mul.lo.s32 %r206, %r264, %r125;
    sub.s32 %r262, %r205, %r206;
    mov.u32 %r261, %r37;

$L__BB0_66:
    mul.lo.s32 %r209, %r261, %r128;
    mad.lo.s32 %r210, %r262, %r129, %r209;
    mad.lo.s32 %r211, %r263, %r130, %r210;
    mad.lo.s32 %r76, %r264, %r124, %r211;
    setp.eq.s16 %p47, %rs75, 0;
    @%p47 bra $L__BB0_68;

    mul.wide.u32 %rd59, %r76, 2;
    add.s64 %rd60, %rd2, %rd59;
    ld.global.u16 %rs174, [%rd60];
    bra.uni $L__BB0_69;

$L__BB0_68:
    mov.u32 %r212, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs174, %r212;
    // end inline asm

$L__BB0_69:
    // begin inline asm
    { cvt.f32.f16 %f94, %rs174;}
    // end inline asm
    setp.eq.s16 %p48, %rs74, 0;
    @%p48 bra $L__BB0_71;

    mul.wide.u32 %rd61, %r76, 2;
    add.s64 %rd62, %rd3, %rd61;
    ld.global.u16 %rs175, [%rd62];
    bra.uni $L__BB0_72;

$L__BB0_71:
    mov.u32 %r213, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs175, %r213;
    // end inline asm

$L__BB0_72:
    // begin inline asm
    { cvt.f32.f16 %f95, %rs175;}
    // end inline asm
    add.s64 %rd22, %rd93, -2048;
    ld.global.u16 %rs108, [%rd93+-2048];
    // begin inline asm
    { cvt.f32.f16 %f96, %rs108;}
    // end inline asm
    sub.ftz.f32 %f98, %f96, %f153;
    mul.ftz.f32 %f99, %f164, %f98;
    fma.rn.ftz.f32 %f100, %f94, %f99, %f95;
    setp.lt.ftz.f32 %p49, %f100, 0f00000000;
    setp.ne.s16 %p50, %rs66, 0;
    and.pred %p51, %p50, %p49;
    selp.f32 %f97, 0f00000000, %f100, %p51;
    // begin inline asm
    { cvt.rn.f16.f32 %rs109, %f97;}
    // end inline asm
    add.s64 %rd23, %rd94, -2048;
    st.global.u16 [%rd94+-2048], %rs109;
    add.s32 %r77, %r272, 1024;
    @%p45 bra $L__BB0_76;

    setp.eq.s16 %p53, %rs68, 0;
    @%p53 bra $L__BB0_75;

    div.u32 %r266, %r77, %r39;
    mul.lo.s32 %r214, %r266, %r39;
    sub.s32 %r215, %r77, %r214;
    div.u32 %r267, %r215, %r127;
    mul.lo.s32 %r216, %r267, %r127;
    sub.s32 %r268, %r215, %r216;
    mov.u32 %r265, %r37;
    bra.uni $L__BB0_77;

$L__BB0_76:
    div.u32 %r265, %r37, %r125;
    mul.lo.s32 %r220, %r265, %r125;
    sub.s32 %r266, %r37, %r220;
    div.u32 %r267, %r77, %r127;
    mul.lo.s32 %r221, %r267, %r127;
    sub.s32 %r268, %r77, %r221;
    bra.uni $L__BB0_77;

$L__BB0_75:
    div.u32 %r267, %r77, %r38;
    mul.lo.s32 %r217, %r267, %r38;
    sub.s32 %r218, %r77, %r217;
    div.u32 %r268, %r218, %r125;
    mul.lo.s32 %r219, %r268, %r125;
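    // [annotation] Still inside $L__BB0_75: the div/mod decomposition of the second
    // unrolled element's flat index finishes below, and $L__BB0_77 combines the
    // resulting coordinates with the strides in %r128/%r129/%r130/%r124 to address
    // the scale and bias tensors.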
    sub.s32 %r266, %r218, %r219;
    mov.u32 %r265, %r37;

$L__BB0_77:
    mul.lo.s32 %r222, %r265, %r128;
    mad.lo.s32 %r223, %r266, %r129, %r222;
    mad.lo.s32 %r224, %r267, %r130, %r223;
    mad.lo.s32 %r92, %r268, %r124, %r224;
    @%p47 bra $L__BB0_79;

    mul.wide.u32 %rd63, %r92, 2;
    add.s64 %rd64, %rd2, %rd63;
    ld.global.u16 %rs176, [%rd64];
    bra.uni $L__BB0_80;

$L__BB0_79:
    mov.u32 %r225, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs176, %r225;
    // end inline asm

$L__BB0_80:
    // begin inline asm
    { cvt.f32.f16 %f101, %rs176;}
    // end inline asm
    @%p48 bra $L__BB0_82;

    mul.wide.u32 %rd65, %r92, 2;
    add.s64 %rd66, %rd3, %rd65;
    ld.global.u16 %rs177, [%rd66];
    bra.uni $L__BB0_83;

$L__BB0_82:
    mov.u32 %r226, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs177, %r226;
    // end inline asm

$L__BB0_83:
    // begin inline asm
    { cvt.f32.f16 %f102, %rs177;}
    // end inline asm
    ld.global.u16 %rs119, [%rd22+2048];
    // begin inline asm
    { cvt.f32.f16 %f103, %rs119;}
    // end inline asm
    sub.ftz.f32 %f105, %f103, %f153;
    mul.ftz.f32 %f106, %f164, %f105;
    fma.rn.ftz.f32 %f107, %f101, %f106, %f102;
    setp.lt.ftz.f32 %p56, %f107, 0f00000000;
    and.pred %p58, %p50, %p56;
    selp.f32 %f104, 0f00000000, %f107, %p58;
    // begin inline asm
    { cvt.rn.f16.f32 %rs120, %f104;}
    // end inline asm
    st.global.u16 [%rd23+2048], %rs120;
    add.s64 %rd94, %rd94, 4096;
    add.s64 %rd93, %rd93, 4096;
    add.s32 %r272, %r272, 2048;
    setp.lt.s32 %p59, %r272, %r116;
    @%p59 bra $L__BB0_61;

$L__BB0_120:
    ret;

}
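
// ---------------------------------------------------------------------------
// [annotation] The block below is not compiler output. It is a minimal CUDA C++
// sketch of what the PTX above appears to implement (block-wide mean, variance,
// rsqrt(variance + eps), then scale * (x - mean) * rsigma + bias with an optional
// clamp of negative results to zero), added only as a reading aid. The struct,
// field, and function names are assumptions; only the arithmetic and the
// shuffle/shared-memory reduction shape are taken from the instructions above.
// The sketch covers the contiguous scale/bias path; the PTX additionally carries
// a strided N-D indexing path for scale/bias and 4x-unrolled element loops.
// ---------------------------------------------------------------------------
#include <cuda_fp16.h>

struct MVNParamsSketch {                  // hypothetical parameter layout
    const __half* input;                  // param_0+0  (%rd1)
    const __half* scale;                  // param_0+8  (%rd2, defaults to 1)
    const __half* bias;                   // param_0+16 (%rd3, defaults to 0)
    __half*       output;                 // param_0+24 (%rd4)
    int   n;                              // elements per block row (param_0+32, %r116)
    float inv_n;                          // 1.0f / n   (param_0+36, %f46)
    float eps;                            // epsilon    (param_0+40, %f47)
    bool  compute_mean, compute_var, use_bias, use_scale;   // %rs72..%rs75
    bool  clamp_negative;                 // %rs66: selp at 0f00000000 after the fma
};

// Block-wide sum: warp shfl_down reduction, then a shared-memory staging buffer
// and a second reduction in warp 0, mirroring the shfl.sync.down / st.shared /
// bar.sync pattern in the PTX. The result is valid in thread 0.
__device__ float blockSumSketch(float v) {
    __shared__ float temp[32];
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        v += __shfl_down_sync(0xffffffffu, v, offset);
    if (threadIdx.x % warpSize == 0) temp[threadIdx.x / warpSize] = v;
    __syncthreads();
    if (threadIdx.x < warpSize) {
        v = (threadIdx.x < blockDim.x / warpSize) ? temp[threadIdx.x] : 0.0f;
        for (int offset = warpSize / 2; offset > 0; offset /= 2)
            v += __shfl_down_sync(0xffffffffu, v, offset);
    }
    __syncthreads();
    return v;
}

__global__ void mvnLargeSketch(MVNParamsSketch p) {
    __shared__ float mu, rsigma;
    const __half* row = p.input  + (size_t)blockIdx.x * p.n;
    __half*       out = p.output + (size_t)blockIdx.x * p.n;

    float mean = 0.0f;
    if (p.compute_mean) {
        float s = 0.0f;
        for (int i = threadIdx.x; i < p.n; i += blockDim.x)
            s += __half2float(row[i]);
        // The PTX scales each partial sum by inv_n before reducing; the total is the mean.
        s = blockSumSketch(s * p.inv_n);
        if (threadIdx.x == 0) mu = s;
        __syncthreads();
        mean = mu;
    }

    float rs = 1.0f;                      // %f164 defaults to 1.0 when variance is skipped
    if (p.compute_var) {
        float s = 0.0f;
        for (int i = threadIdx.x; i < p.n; i += blockDim.x) {
            float d = __half2float(row[i]) - mean;
            s = fmaf(d, d, s);
        }
        s = blockSumSketch(s * p.inv_n);
        if (threadIdx.x == 0) rsigma = rsqrtf(p.eps + s);   // rsqrt.approx.ftz.f32
        __syncthreads();
        rs = rsigma;
    }

    for (int i = threadIdx.x; i < p.n; i += blockDim.x) {
        float g = p.use_scale ? __half2float(p.scale[i]) : 1.0f;
        float b = p.use_bias  ? __half2float(p.bias[i])  : 0.0f;
        float y = fmaf(g, (__half2float(row[i]) - mean) * rs, b);
        if (p.clamp_negative && y < 0.0f) y = 0.0f;          // optional ReLU-style clamp
        out[i] = __float2half_rn(y);
    }
}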