// Thrust placeholder symbols emitted by the compiler (unused at run time).
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN51_INTERNAL_f319aae6_20_genmmha_fused_mbh_cu_7ba56fce6thrust12placeholders3_10E[1];

// Dynamically sized shared memory, 16-byte aligned.
.extern .shared .align 16 .b8 smem[];

.visible .entry MultiHeadAttention_mbh_kernel(
    .param .align 8 .b8 MultiHeadAttention_mbh_kernel_param_0[144]
)
.maxntid 1024, 1, 1
.minnctapersm 1
{
    .reg .pred %p<136>;
    .reg .b16 %rs<98>;
    .reg .f32 %f<457>;
    .reg .b32 %r<423>;
    .reg .b64 %rd<181>;

    // All kernel arguments arrive as a single 144-byte aggregate; fields are read by byte offset.
    mov.b64 %rd53, MultiHeadAttention_mbh_kernel_param_0;
    mov.u64 %rd1, %rd53;
    ld.param.u64 %rd54, [MultiHeadAttention_mbh_kernel_param_0+24];
    cvta.to.global.u64 %rd2, %rd54;
    ld.param.u64 %rd3, [MultiHeadAttention_mbh_kernel_param_0+32];
    ld.param.u64 %rd5, [MultiHeadAttention_mbh_kernel_param_0+8];
    ld.param.u64 %rd7, [MultiHeadAttention_mbh_kernel_param_0+48];
    ld.param.u64 %rd8, [MultiHeadAttention_mbh_kernel_param_0+56];
    ld.param.v2.u32 {%r122, %r123}, [MultiHeadAttention_mbh_kernel_param_0+112];
    ld.param.u32 %r4, [MultiHeadAttention_mbh_kernel_param_0+124];
    mov.u32 %r5, %ctaid.x;
    mov.u32 %r6, %ctaid.y;
    mov.u32 %r7, %ctaid.z;
    // Timestep count %r393: lengths[%ctaid.y] + 1 when the optional array at param+80 is non-null,
    // else the maximum count loaded from param+112.
    ld.param.u64 %rd55, [MultiHeadAttention_mbh_kernel_param_0+80];
    setp.eq.s64 %p1, %rd55, 0;
    mov.u32 %r393, %r122;
    @%p1 bra $L__BB0_2;
    ld.param.u64 %rd56, [%rd1+80];
    cvta.to.global.u64 %rd57, %rd56;
    mul.wide.s32 %rd58, %r6, 4;
    add.s64 %rd59, %rd57, %rd58;
    ld.global.u32 %r124, [%rd59];
    add.s32 %r393, %r124, 1;
$L__BB0_2:
    add.s32 %r125, %r393, -1;
    rem.s32 %r10, %r125, %r122;
    min.s32 %r11, %r393, %r122;
    // %r12: number of %ctaid.z partitions over the timesteps (param+104 when the count exceeds 256, else 1).
    setp.gt.s32 %p2, %r11, 256;
    ld.param.u32 %r126, [%rd1+104];
    selp.b32 %r12, %r126, 1, %p2;
    setp.ge.s32 %p3, %r7, %r12;
    @%p3 bra $L__BB0_105;
    add.s32 %r127, %r11, %r12;
    add.s32 %r128, %r127, -1;
    div.s32 %r13, %r128, %r12;
    mul.lo.s32 %r14, %r13, %r7;
    add.s32 %r129, %r14, %r13;
    min.s32 %r15, %r129, %r11;
    not.b32 %r130, %r14;
    add.s32 %r16, %r15, %r130;
    mad.lo.s32 %r17, %r123, %r6, %r5;
    mul.lo.s32 %r131, %r17, %r4;
    cvt.s64.s32 %rd9, %r131;
    shl.b32 %r132, %r4, 5;
    add.s32 %r18, %r132, 128;
    ld.param.u32 %r133, [%rd1+120];
    div.s32 %r19, %r123, %r133;
    div.s32 %r134, %r5, %r19;
    mad.lo.s32 %r135, %r133, %r6, %r134;
    mul.lo.s32 %r20, %r135, %r4;
    mul.lo.s32 %r21, %r20, %r122;
    shr.u32 %r136, %r4, 31;
    add.s32 %r137, %r4, %r136;
    shr.s32 %r22, %r137, 1;
    mov.u32 %r23, %tid.x;
    setp.ge.s32 %p4, %r23, %r22;
    @%p4 bra $L__BB0_10;
    // Stage the current f16 vector (param+24, element offset %rd9) into shared memory as f32 pairs:
    // a "nounroll" remainder loop first, then a 4x-unrolled main loop.
    not.b32 %r138, %r23;
    add.s32 %r24, %r22, %r138;
    shr.u32 %r139, %r24, 10;
    add.s32 %r140, %r139, 1;
    and.b32 %r396, %r140, 3;
    setp.eq.s32 %p5, %r396, 0;
    mov.u32 %r397, %r23;
    @%p5 bra $L__BB0_7;
    shl.b32 %r141, %r23, 3;
    mov.u32 %r142, smem;
    add.s32 %r143, %r142, %r141;
    add.s32 %r394, %r143, 256;
    mul.wide.s32 %rd60, %r23, 2;
    add.s64 %rd61, %rd60, %rd9;
    shl.b64 %rd62, %rd61, 1;
    add.s64 %rd175, %rd2, %rd62;
    mov.u32 %r397, %r23;
$L__BB0_6:
    .pragma "nounroll";
    ld.global.u32 %r144, [%rd175];
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r144; cvt.f32.f16 %f91, high;}
    // end inline asm
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r144; cvt.f32.f16 %f90, low;}
    // end inline asm
    st.shared.v2.f32 [%r394], {%f90, %f91};
    add.s32 %r397, %r397, 1024;
    add.s32 %r394, %r394, 8192;
    add.s64 %rd175, %rd175, 4096;
    add.s32 %r396, %r396, -1;
    setp.ne.s32 %p6, %r396, 0;
    @%p6 bra $L__BB0_6;
$L__BB0_7:
    setp.lt.u32 %p7, %r24, 3072;
    @%p7 bra $L__BB0_10;
    shl.b32 %r146, %r397, 3;
    mov.u32 %r147, smem;
    add.s32 %r148, %r147, %r146;
    add.s32 %r398, %r148, 16640;
    mul.wide.s32 %rd63, %r397, 2;
    add.s64 %rd64, %rd63, %rd9;
    shl.b64 %rd65, %rd64, 1;
    add.s64 %rd66, %rd2, %rd65;
    add.s64 %rd176, %rd66, 8192;
$L__BB0_9:
    ld.global.u32 %r149, [%rd176+-8192];
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r149; cvt.f32.f16 %f93, high;}
    // end inline asm
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r149; cvt.f32.f16 %f92, low;}
    // end inline asm
    st.shared.v2.f32 [%r398+-16384], {%f92, %f93};
    ld.global.u32 %r151, [%rd176+-4096];
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r151; cvt.f32.f16 %f95, high;}
    // end inline asm
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r151; cvt.f32.f16 %f94, low;}
    // end inline asm
    st.shared.v2.f32 [%r398+-8192], {%f94, %f95};
    ld.global.u32 %r153, [%rd176];
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r153; cvt.f32.f16 %f97, high;}
    // end inline asm
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r153; cvt.f32.f16 %f96, low;}
    // end inline asm
    st.shared.v2.f32 [%r398], {%f96, %f97};
    ld.global.u32 %r155, [%rd176+4096];
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r155; cvt.f32.f16 %f99, high;}
    // end inline asm
    // begin inline asm
    {.reg .f16 low,high; mov.b32 {low,high},%r155; cvt.f32.f16 %f98, low;}
    // end inline asm
    st.shared.v2.f32 [%r398+8192], {%f98, %f99};
    add.s32 %r398, %r398, 32768;
    add.s64 %rd176, %rd176, 16384;
    add.s32 %r397, %r397, 4096;
    setp.lt.s32 %p8, %r397, %r22;
    @%p8 bra $L__BB0_9;
$L__BB0_10:
    bar.sync 0;
    // Each warp walks a strided range of timesteps and computes a dot product against the staged
    // vector, 4 f16 lanes per iteration. At timestep %r10 the vector comes from the "current" buffer
    // (param+32) and, when %ctaid.x is a multiple of %r19, is also written back into the cache.
    mov.u32 %r157, WARP_SZ;
    div.u32 %r409, %r23, %r157;
    mul.lo.s32 %r158, %r409, %r157;
    sub.s32 %r40, %r23, %r158;
    add.s32 %r41, %r409, %r14;
    setp.ge.s32 %p9, %r41, %r15;
    @%p9 bra $L__BB0_28;
    cvt.s64.s32 %rd16, %r21;
    cvta.to.global.u64 %rd67, %rd3;
    mul.wide.s32 %rd68, %r20, 2;
    add.s64 %rd17, %rd67, %rd68;
    shr.s32 %r159, %r4, 31;
    shr.u32 %r160, %r159, 30;
    add.s32 %r161, %r4, %r160;
    shr.s32 %r42, %r161, 2;
    // %f100 scales each dot product; %f101 is added where the optional mask word (param+48) is zero.
    ld.param.v2.f32 {%f100, %f101}, [%rd1+128];
    sub.s32 %r43, %r18, %r14;
    mad.lo.s32 %r162, %r4, %r6, %r5;
    mul.lo.s32 %r44, %r162, %r11;
    mul.lo.s32 %r45, %r11, %r6;
    cvta.to.global.u64 %rd18, %rd5;
    cvta.to.global.u64 %rd19, %rd7;
    cvta.to.global.u64 %rd20, %rd8;
    mov.u32 %r400, %r41;
$L__BB0_12:
    setp.ge.s32 %p10, %r40, %r42;
    mov.f32 %f424, 0f00000000;
    @%p10 bra $L__BB0_18;
    mul.lo.s32 %r163, %r400, %r4;
    cvt.s64.s32 %rd69, %r163;
    add.s64 %rd70, %rd69, %rd16;
    shl.b64 %rd71, %rd70, 1;
    add.s64 %rd21, %rd18, %rd71;
    setp.eq.s32 %p11, %r400, %r10;
    selp.b64 %rd22, %rd17, %rd21, %p11;
    mov.u32 %r401, %r40;
$L__BB0_14:
    cvt.s64.s32 %rd23, %r401;
    mul.wide.s32 %rd72, %r401, 8;
    add.s64 %rd73, %rd22, %rd72;
    ld.global.v4.u16 {%rs33, %rs34, %rs35, %rs36}, [%rd73];
    setp.ne.s32 %p12, %r400, %r10;
    @%p12 bra $L__BB0_17;
    rem.s32 %r165, %r5, %r19;
    setp.ne.s32 %p13, %r165, 0;
    @%p13 bra $L__BB0_17;
    shl.b64 %rd74, %rd23, 3;
    add.s64 %rd75, %rd21, %rd74;
    st.global.v4.u16 [%rd75], {%rs33, %rs34, %rs35, %rs36};
$L__BB0_17:
    cvt.u32.u64 %r166, %rd23;
    shl.b32 %r167, %r166, 4;
    mov.u32 %r168, smem;
    add.s32 %r169, %r168, %r167;
    ld.shared.v4.f32 {%f108, %f109, %f110, %f111}, [%r169+256];
    // begin inline asm
    { cvt.f32.f16 %f104, %rs33;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f105, %rs34;}
    // end inline asm
    mul.ftz.f32 %f116, %f109, %f105;
    fma.rn.ftz.f32 %f117, %f108, %f104, %f116;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f118, %f110, %f106, %f117;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f119, %f111, %f107, %f118;
    add.ftz.f32 %f424, %f424, %f119;
    mov.u32 %r170, WARP_SZ;
    add.s32 %r401, %r166, %r170;
    setp.lt.s32 %p14, %r401, %r42;
    @%p14 bra $L__BB0_14;
$L__BB0_18:
    // Warp-level tree reduction of the partial dot product via shfl.sync.down.
    mov.u32 %r402, WARP_SZ;
    setp.lt.s32 %p15, %r402, 2;
    bar.warp.sync -1;
    @%p15 bra $L__BB0_21;
$L__BB0_20:
    mov.b32 %r172, %f424;
    shr.u32 %r173, %r402, 31;
    add.s32 %r174, %r402, %r173;
    shr.s32 %r51, %r174, 1;
    mov.u32 %r175, 31;
    mov.u32 %r176, -1;
    shfl.sync.down.b32 %r177|%p16, %r172, %r51, %r175, %r176;
    mov.b32 %f120, %r177;
    add.ftz.f32 %f424, %f424, %f120;
    setp.gt.s32 %p17, %r402, 3;
    mov.u32 %r402, %r51;
    @%p17 bra $L__BB0_20;
$L__BB0_21:
    // Lane 0: scale the logit by %f100, add the f16 bias (param+56) when present, otherwise the
    // mask-selected constant %f101 (param+48), then store the logit to shared memory.
    setp.ne.s32 %p18, %r40, 0;
    @%p18 bra $L__BB0_27;
    setp.eq.s64 %p19, %rd8, 0;
    mul.ftz.f32 %f425, %f424, %f100;
    @%p19 bra $L__BB0_24;
    add.s32 %r178, %r400, %r44;
    mul.wide.s32 %rd76, %r178, 2;
    add.s64 %rd77, %rd20, %rd76;
    ld.global.u16 %rs41, [%rd77];
    // begin inline asm
    { cvt.f32.f16 %f121, %rs41;}
    // end inline asm
    add.ftz.f32 %f425, %f425, %f121;
    bra.uni $L__BB0_26;
$L__BB0_24:
    setp.eq.s64 %p20, %rd7, 0;
    @%p20 bra $L__BB0_26;
    add.s32 %r179, %r400, %r45;
    mul.wide.s32 %rd78, %r179, 4;
    add.s64 %rd79, %rd19, %rd78;
    ld.global.u32 %r180, [%rd79];
    setp.eq.s32 %p21, %r180, 0;
    selp.f32 %f122, %f101, 0f00000000, %p21;
    add.ftz.f32 %f425, %f425, %f122;
$L__BB0_26:
    add.s32 %r181, %r43, %r400;
    shl.b32 %r182, %r181, 2;
    mov.u32 %r183, smem;
    add.s32 %r184, %r183, %r182;
    st.shared.f32 [%r184+256], %f425;
$L__BB0_27:
    add.s32 %r400, %r400, 32;
    setp.lt.s32 %p22, %r400, %r15;
    @%p22 bra $L__BB0_12;
$L__BB0_28:
    // Block-wide max reduction over the logits in shared memory (softmax max).
    bar.sync 0;
    mov.u32 %r53, %ntid.x;
    mov.u32 %r185, %tid.x;
    setp.gt.s32 %p23, %r185, %r16;
    mov.f32 %f428, 0fFF7FFFFF;
    @%p23 bra $L__BB0_30;
    add.s32 %r187, %r18, %r185;
    shl.b32 %r188, %r187, 2;
    mov.u32 %r189, smem;
    add.s32 %r190, %r189, %r188;
    ld.shared.f32 %f428, [%r190+256];
$L__BB0_30:
    add.s32 %r403, %r185, %r53;
    setp.gt.s32 %p24, %r403, %r16;
    @%p24 bra $L__BB0_33;
    mov.u32 %r194, smem;
$L__BB0_32:
    add.s32 %r192, %r403, %r18;
    shl.b32 %r193, %r192, 2;
    add.s32 %r195, %r194, %r193;
    ld.shared.f32 %f124, [%r195+256];
    setp.gt.ftz.f32 %p25, %f124, %f428;
    selp.f32 %f428, %f124, %f428, %p25;
    add.s32 %r403, %r403, %r53;
    setp.le.s32 %p26, %r403, %r16;
    @%p26 bra $L__BB0_32;
$L__BB0_33:
    // Warp shuffle max reduction (deltas 1, 2, 4, 8, 16); lane 0 of each warp stores its maximum.
    shr.s32 %r224, %r185, 31;
    shr.u32 %r225, %r224, 27;
    add.s32 %r226, %r185, %r225;
    shr.s32 %r227, %r226, 5;
    // begin inline asm
    mov.u32 %r196, %laneid;
    // end inline asm
    // begin inline asm
    mov.u32 %r197, %laneid;
    // end inline asm
    mov.b32 %r199, %f428;
    mov.u32 %r200, 1;
    mov.u32 %r221, 31;
    mov.u32 %r222, -1;
    // begin inline asm
    shfl.sync.down.b32 %r198, %r199, %r200, %r221, %r222;
    // end inline asm
    mov.b32 %f125, %r198;
    setp.lt.s32 %p27, %r197, 31;
    setp.lt.ftz.f32 %p28, %f428, %f125;
    and.pred %p29, %p27, %p28;
    selp.f32 %f126, %f125, %f428, %p29;
    mov.b32 %r204, %f126;
    mov.u32 %r205, 2;
    // begin inline asm
    shfl.sync.down.b32 %r203, %r204, %r205, %r221, %r222;
    // end inline asm
    mov.b32 %f127, %r203;
    setp.lt.s32 %p30, %r197, 30;
    setp.lt.ftz.f32 %p31, %f126, %f127;
    and.pred %p32, %p30, %p31;
    selp.f32 %f128, %f127, %f126, %p32;
    mov.b32 %r209, %f128;
    mov.u32 %r210, 4;
    // begin inline asm
    shfl.sync.down.b32 %r208, %r209, %r210, %r221, %r222;
    // end inline asm
    mov.b32 %f129, %r208;
    setp.lt.s32 %p33, %r197, 28;
    setp.lt.ftz.f32 %p34, %f128, %f129;
    and.pred %p35, %p33, %p34;
    selp.f32 %f130, %f129, %f128, %p35;
    mov.b32 %r214, %f130;
    mov.u32 %r215, 8;
    // begin inline asm
    shfl.sync.down.b32 %r213, %r214, %r215, %r221, %r222;
    // end inline asm
    mov.b32 %f131, %r213;
    setp.lt.s32 %p36, %r197, 24;
    setp.lt.ftz.f32 %p37, %f130, %f131;
    and.pred %p38, %p36, %p37;
    selp.f32 %f132, %f131, %f130, %p38;
    mov.b32 %r219, %f132;
    mov.u32 %r220, 16;
    // begin inline asm
    shfl.sync.down.b32 %r218, %r219, %r220, %r221, %r222;
    // end inline asm
    mov.b32 %f133, %r218;
    setp.lt.s32 %p39, %r197, 16;
    setp.lt.ftz.f32 %p40, %f132, %f133;
    and.pred %p41, %p39, %p40;
    selp.f32 %f429, %f133, %f132, %p41;
    setp.ne.s32 %p42, %r196, 0;
    shl.b32 %r228, %r227, 2;
    mov.u32 %r229, smem;
    add.s32 %r230, %r229, %r228;
    @%p42 bra $L__BB0_35;
    add.s32 %r377, %r230, 32;
    st.shared.f32 [%r377], %f429;
$L__BB0_35:
    // Thread 0 folds the per-warp maxima (smem+36 .. smem+156) into the global maximum.
    bar.sync 0;
    setp.ne.s32 %p43, %r185, 0;
    @%p43 bra $L__BB0_37;
    ld.shared.f32 %f134, [smem+36];
    setp.gt.ftz.f32 %p44, %f134, %f429;
    selp.f32 %f135, %f134, %f429, %p44;
    ld.shared.v2.f32 {%f136, %f137}, [smem+40];
    setp.gt.ftz.f32 %p45, %f136, %f135;
    selp.f32 %f140, %f136, %f135, %p45;
    setp.gt.ftz.f32 %p46, %f137, %f140;
    selp.f32 %f141, %f137, %f140, %p46;
    ld.shared.v4.f32 {%f142, %f143, %f144, %f145}, [smem+48];
    setp.gt.ftz.f32 %p47, %f142, %f141;
    selp.f32 %f150, %f142, %f141, %p47;
    setp.gt.ftz.f32 %p48, %f143, %f150;
    selp.f32 %f151, %f143, %f150, %p48;
    setp.gt.ftz.f32 %p49, %f144, %f151;
    selp.f32 %f152, %f144, %f151, %p49;
    setp.gt.ftz.f32 %p50, %f145, %f152;
    selp.f32 %f153, %f145, %f152, %p50;
    ld.shared.v4.f32 {%f154, %f155, %f156, %f157}, [smem+64];
    setp.gt.ftz.f32 %p51, %f154, %f153;
    selp.f32 %f162, %f154, %f153, %p51;
    setp.gt.ftz.f32 %p52, %f155, %f162;
    selp.f32 %f163, %f155, %f162, %p52;
    setp.gt.ftz.f32 %p53, %f156, %f163;
    selp.f32 %f164, %f156, %f163, %p53;
    setp.gt.ftz.f32 %p54, %f157, %f164;
    selp.f32 %f165, %f157, %f164, %p54;
    ld.shared.v4.f32 {%f166, %f167, %f168, %f169}, [smem+80];
    setp.gt.ftz.f32 %p55, %f166, %f165;
    selp.f32 %f174, %f166, %f165, %p55;
    setp.gt.ftz.f32 %p56, %f167, %f174;
    selp.f32 %f175, %f167, %f174, %p56;
    setp.gt.ftz.f32 %p57, %f168, %f175;
    selp.f32 %f176, %f168, %f175, %p57;
    setp.gt.ftz.f32 %p58, %f169, %f176;
    selp.f32 %f177, %f169, %f176, %p58;
    ld.shared.v4.f32 {%f178, %f179, %f180, %f181}, [smem+96];
    setp.gt.ftz.f32 %p59, %f178, %f177;
    selp.f32 %f186, %f178, %f177, %p59;
    setp.gt.ftz.f32 %p60, %f179, %f186;
    selp.f32 %f187, %f179, %f186, %p60;
    setp.gt.ftz.f32 %p61, %f180, %f187;
    selp.f32 %f188, %f180, %f187, %p61;
    setp.gt.ftz.f32 %p62, %f181, %f188;
    selp.f32 %f189, %f181, %f188, %p62;
    ld.shared.v4.f32 {%f190, %f191, %f192, %f193}, [smem+112];
    setp.gt.ftz.f32 %p63, %f190, %f189;
    selp.f32 %f198, %f190, %f189, %p63;
    setp.gt.ftz.f32 %p64, %f191, %f198;
    selp.f32 %f199, %f191, %f198, %p64;
    setp.gt.ftz.f32 %p65, %f192, %f199;
    selp.f32 %f200, %f192, %f199, %p65;
    setp.gt.ftz.f32 %p66, %f193, %f200;
    selp.f32 %f201, %f193, %f200, %p66;
    ld.shared.v4.f32 {%f202, %f203, %f204, %f205}, [smem+128];
    setp.gt.ftz.f32 %p67, %f202, %f201;
    selp.f32 %f210, %f202, %f201, %p67;
    setp.gt.ftz.f32 %p68, %f203, %f210;
    selp.f32 %f211, %f203, %f210, %p68;
    setp.gt.ftz.f32 %p69, %f204, %f211;
    selp.f32 %f212, %f204, %f211, %p69;
    setp.gt.ftz.f32 %p70, %f205, %f212;
    selp.f32 %f213, %f205, %f212, %p70;
    ld.shared.v4.f32 {%f214, %f215, %f216, %f217}, [smem+144];
    setp.gt.ftz.f32 %p71, %f214, %f213;
    selp.f32 %f222, %f214, %f213, %p71;
    setp.gt.ftz.f32 %p72, %f215, %f222;
    selp.f32 %f223, %f215, %f222, %p72;
    setp.gt.ftz.f32 %p73, %f216, %f223;
    selp.f32 %f224, %f216, %f223, %p73;
    setp.gt.ftz.f32 %p74, %f217, %f224;
    selp.f32 %f429, %f217, %f224, %p74;
$L__BB0_37:
    mov.u32 %r379, %tid.x;
    setp.ne.s32 %p133, %r379, 0;
    @%p133 bra $L__BB0_39;
    st.shared.f32 [smem+164], %f429;
$L__BB0_39:
    // exp(logit - max) computed as ex2((x - max) * log2(e)); the probabilities overwrite the
    // logits in shared memory while each thread accumulates its share of the denominator.
    mov.u32 %r380, %tid.x;
    setp.gt.s32 %p134, %r380, %r16;
    bar.sync 0;
    mov.f32 %f431, 0f00000000;
    ld.shared.f32 %f21, [smem+164];
    @%p134 bra $L__BB0_42;
    mov.u32 %r404, %tid.x;
$L__BB0_41:
    add.s32 %r234, %r404, %r18;
    shl.b32 %r235, %r234, 2;
    add.s32 %r237, %r229, %r235;
    ld.shared.f32 %f227, [%r237+256];
    sub.ftz.f32 %f228, %f227, %f21;
    mul.ftz.f32 %f229, %f228, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f230, %f229;
    add.ftz.f32 %f431, %f431, %f230;
    st.shared.f32 [%r237+256], %f230;
    add.s32 %r404, %r404, %r53;
    setp.le.s32 %p77, %r404, %r16;
    @%p77 bra $L__BB0_41;
$L__BB0_42:
    // Warp shuffle sum reduction of the partial denominators (predicated shfl + add inline asm).
    bar.sync 0;
    // begin inline asm
    mov.u32 %r238, %laneid;
    // end inline asm
    mov.u32 %r240, 1;
    mov.u32 %r253, 31;
    mov.u32 %r254, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f431, %r240, %r253, %r254; @p add.f32 r0, r0, %f431; mov.f32 %f231, r0;}
    // end inline asm
    mov.u32 %r243, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f231, %r243, %r253, %r254; @p add.f32 r0, r0, %f231; mov.f32 %f234, r0;}
    // end inline asm
    mov.u32 %r246, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f234, %r246, %r253, %r254; @p add.f32 r0, r0, %f234; mov.f32 %f237, r0;}
    // end inline asm
    mov.u32 %r249, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f237, %r249, %r253, %r254; @p add.f32 r0, r0, %f237; mov.f32 %f240, r0;}
    // end inline asm
    mov.u32 %r252, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f240, %r252, %r253, %r254; @p add.f32 r0, r0, %f240; mov.f32 %f432, r0;}
    // end inline asm
    setp.ne.s32 %p78, %r238, 0;
    @%p78 bra $L__BB0_44;
    add.s32 %r378, %r230, 32;
    st.shared.f32 [%r378], %f432;
$L__BB0_44:
    // Thread 0 adds the per-warp partial sums from shared memory.
    mov.u32 %r414, %tid.x;
    setp.ne.s32 %p79, %r414, 0;
    bar.sync 0;
    @%p79 bra $L__BB0_46;
    ld.shared.f32 %f246, [smem+36];
    add.ftz.f32 %f247, %f432, %f246;
    ld.shared.v2.f32 {%f248, %f249}, [smem+40];
    add.ftz.f32 %f252, %f247, %f248;
    add.ftz.f32 %f253, %f252, %f249;
    ld.shared.v4.f32 {%f254, %f255, %f256, %f257}, [smem+48];
    add.ftz.f32 %f262, %f253, %f254;
    add.ftz.f32 %f263, %f262, %f255;
    add.ftz.f32 %f264, %f263, %f256;
    add.ftz.f32 %f265, %f264, %f257;
    ld.shared.v4.f32 {%f266, %f267, %f268, %f269}, [smem+64];
    add.ftz.f32 %f274, %f265, %f266;
    add.ftz.f32 %f275, %f274, %f267;
    add.ftz.f32 %f276, %f275, %f268;
    add.ftz.f32 %f277, %f276, %f269;
    ld.shared.v4.f32 {%f278, %f279, %f280, %f281}, [smem+80];
    add.ftz.f32 %f286, %f277, %f278;
    add.ftz.f32 %f287, %f286, %f279;
    add.ftz.f32 %f288, %f287, %f280;
    add.ftz.f32 %f289, %f288, %f281;
    ld.shared.v4.f32 {%f290, %f291, %f292, %f293}, [smem+96];
    add.ftz.f32 %f298, %f289, %f290;
    add.ftz.f32 %f299, %f298, %f291;
    add.ftz.f32 %f300, %f299, %f292;
    add.ftz.f32 %f301, %f300, %f293;
    ld.shared.v4.f32 {%f302, %f303, %f304, %f305}, [smem+112];
    add.ftz.f32 %f310, %f301, %f302;
    add.ftz.f32 %f311, %f310, %f303;
    add.ftz.f32 %f312, %f311, %f304;
    add.ftz.f32 %f313, %f312, %f305;
    ld.shared.v4.f32 {%f314, %f315, %f316, %f317}, [smem+128];
    add.ftz.f32 %f322, %f313, %f314;
    add.ftz.f32 %f323, %f322, %f315;
    add.ftz.f32 %f324, %f323, %f316;
    add.ftz.f32 %f325, %f324, %f317;
    ld.shared.v4.f32 {%f326, %f327, %f328, %f329}, [smem+144];
    add.ftz.f32 %f334, %f325, %f326;
    add.ftz.f32 %f335, %f334, %f327;
    add.ftz.f32 %f336, %f335, %f328;
    add.ftz.f32 %f432, %f336, %f329;
$L__BB0_46:
    @%p79 bra $L__BB0_48;
    st.shared.f32 [smem+164], %f432;
$L__BB0_48:
    // When the sequence is split across %ctaid.z blocks (count > 256), thread 0 stores this
    // block's (max, sum) pair to param+96 for the cross-block softmax fix-up.
    min.s32 %r382, %r393, %r122;
    bar.sync 0;
    ld.shared.f32 %f28, [smem+164];
    setp.lt.s32 %p82, %r382, 257;
    or.pred %p83, %p79, %p82;
    @%p83 bra $L__BB0_50;
    mov.u32 %r392, %ctaid.y;
    mad.lo.s32 %r391, %r123, %r392, %r5;
    mov.u32 %r390, %ctaid.z;
    mov.b64 %rd174, MultiHeadAttention_mbh_kernel_param_0;
    mov.u64 %rd173, %rd174;
    ld.param.u64 %rd81, [%rd173+96];
    cvta.to.global.u64 %rd82, %rd81;
    mad.lo.s32 %r259, %r12, %r391, %r390;
    mul.wide.s32 %rd83, %r259, 8;
    add.s64 %rd84, %rd82, %rd83;
    st.global.f32 [%rd84], %f21;
    st.global.f32 [%rd84+4], %f28;
$L__BB0_50:
    // %f29 = 1 / sum, the softmax normalizer. Each warp then accumulates a probability-weighted
    // sum over the f16 vectors (param+16, current timestep from param+40), four lanes at a time,
    // with the same conditional cache write-back as the first pass.
    rcp.approx.ftz.f32 %f29, %f28;
    shr.s32 %r260, %r4, 31;
    shr.u32 %r261, %r260, 30;
    add.s32 %r262, %r4, %r261;
    shr.s32 %r61, %r262, 2;
    setp.ge.s32 %p84, %r40, %r61;
    @%p84 bra $L__BB0_82;
    mov.u32 %r389, %ctaid.z;
    ld.param.u64 %rd172, [MultiHeadAttention_mbh_kernel_param_0+16];
    ld.param.u64 %rd171, [MultiHeadAttention_mbh_kernel_param_0+40];
    mul.lo.s32 %r388, %r13, %r389;
    cvt.s64.s32 %rd24, %r21;
    cvta.to.global.u64 %rd85, %rd171;
    mul.wide.s32 %rd86, %r20, 2;
    add.s64 %rd25, %rd85, %rd86;
    add.s32 %r263, %r4, 4;
    mul.lo.s32 %r264, %r409, %r263;
    shl.b32 %r265, %r264, 2;
    add.s32 %r267, %r229, 256;
    add.s32 %r62, %r267, %r265;
    add.s32 %r269, %r389, 1;
    mul.lo.s32 %r270, %r13, %r269;
    min.s32 %r271, %r270, %r122;
    min.s32 %r272, %r271, %r393;
    not.b32 %r273, %r272;
    mov.u32 %r274, -2;
    sub.s32 %r275, %r274, %r388;
    sub.s32 %r276, %r275, %r273;
    sub.s32 %r63, %r276, %r409;
    shr.u32 %r277, %r63, 5;
    add.s32 %r278, %r277, 1;
    and.b32 %r64, %r278, 3;
    mul.lo.s32 %r279, %r41, %r4;
    cvt.s64.s32 %rd87, %r279;
    add.s64 %rd88, %rd87, %rd24;
    cvta.to.global.u64 %rd89, %rd172;
    shl.b64 %rd90, %rd88, 1;
    add.s64 %rd26, %rd89, %rd90;
    setp.eq.s32 %p85, %r41, %r10;
    selp.b64 %rd27, %rd25, %rd26, %p85;
    add.s32 %r280, %r409, %r18;
    shl.b32 %r281, %r280, 2;
    add.s32 %r65, %r267, %r281;
    add.s32 %r283, %r279, %r132;
    cvt.s64.s32 %rd91, %r283;
    add.s64 %rd92, %rd91, %rd24;
    shl.b64 %rd93, %rd92, 1;
    add.s64 %rd28, %rd89, %rd93;
    add.s32 %r66, %r41, 32;
    setp.eq.s32 %p86, %r66, %r10;
    selp.b64 %rd29, %rd25, %rd28, %p86;
    add.s32 %r284, %r283, %r132;
    cvt.s64.s32 %rd94, %r284;
    add.s64 %rd95, %rd94, %rd24;
    shl.b64 %rd96, %rd95, 1;
    add.s64 %rd30, %rd89, %rd96;
    add.s32 %r67, %r41, 64;
    setp.eq.s32 %p87, %r67, %r10;
    selp.b64 %rd31, %rd25, %rd30, %p87;
    add.s32 %r68, %r41, 96;
    shl.b32 %r285, %r4, 7;
    add.s32 %r286, %r285, 768;
    mul.lo.s32 %r287, %r389, %r13;
    shl.b32 %r288, %r287, 2;
    sub.s32 %r289, %r286, %r288;
    add.s32 %r69, %r229, %r289;
    mov.u32 %r405, %r40;
$L__BB0_52:
    mov.f32 %f445, 0f00000000;
    mov.f32 %f446, %f445;
    mov.f32 %f447, %f445;
    mov.f32 %f448, %f445;
    @%p9 bra $L__BB0_81;
    setp.eq.s32 %p89, %r64, 0;
    cvt.s64.s32 %rd32, %r405;
    mov.f32 %f448, 0f00000000;
    mov.u32 %r406, %r41;
    mov.f32 %f447, %f448;
    mov.f32 %f446, %f448;
    mov.f32 %f445, %f448;
    @%p89 bra $L__BB0_66;
    shl.b64 %rd97, %rd32, 3;
    add.s64 %rd98, %rd27, %rd97;
    ld.global.v4.u16 {%rs42, %rs43, %rs44, %rs45}, [%rd98];
    setp.ne.s32 %p90, %r41, %r10;
    @%p90 bra $L__BB0_57;
    rem.s32 %r291, %r5, %r19;
    setp.ne.s32 %p91, %r291, 0;
    @%p91 bra $L__BB0_57;
    add.s64 %rd100, %rd26, %rd97;
    st.global.v4.u16 [%rd100], {%rs42, %rs43, %rs44, %rs45};
$L__BB0_57:
    setp.eq.s32 %p92, %r64, 1;
    ld.shared.f32 %f350, [%r65];
    mul.ftz.f32 %f351, %f29, %f350;
    // begin inline asm
    { cvt.f32.f16 %f346, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f351, %f346, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f347, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f447, %f351, %f347, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f348, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f446, %f351, %f348, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f349, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f445, %f351, %f349, 0f00000000;
    mov.u32 %r406, %r66;
    @%p92 bra $L__BB0_66;
    add.s64 %rd102, %rd29, %rd97;
    ld.global.v4.u16 {%rs50, %rs51, %rs52, %rs53}, [%rd102];
    setp.ne.s32 %p93, %r66, %r10;
    @%p93 bra $L__BB0_61;
    rem.s32 %r293, %r5, %r19;
    setp.ne.s32 %p94, %r293, 0;
    @%p94 bra $L__BB0_61;
    add.s64 %rd104, %rd28, %rd97;
    st.global.v4.u16 [%rd104], {%rs50, %rs51, %rs52, %rs53};
$L__BB0_61:
    setp.eq.s32 %p95, %r64, 2;
    ld.shared.f32 %f356, [%r65+128];
    mul.ftz.f32 %f357, %f29, %f356;
    // begin inline asm
    { cvt.f32.f16 %f352, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f357, %f352, %f448;
    // begin inline asm
    { cvt.f32.f16 %f353, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f447, %f357, %f353, %f447;
    // begin inline asm
    { cvt.f32.f16 %f354, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f446, %f357, %f354, %f446;
    // begin inline asm
    { cvt.f32.f16 %f355, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f445, %f357, %f355, %f445;
    mov.u32 %r406, %r67;
    @%p95 bra $L__BB0_66;
    add.s64 %rd106, %rd31, %rd97;
    ld.global.v4.u16 {%rs58, %rs59, %rs60, %rs61}, [%rd106];
    setp.ne.s32 %p96, %r67, %r10;
    @%p96 bra $L__BB0_65;
    rem.s32 %r295, %r5, %r19;
    setp.ne.s32 %p97, %r295, 0;
    @%p97 bra $L__BB0_65;
    add.s64 %rd108, %rd30, %rd97;
    st.global.v4.u16 [%rd108], {%rs58, %rs59, %rs60, %rs61};
$L__BB0_65:
    ld.shared.f32 %f362, [%r65+256];
    mul.ftz.f32 %f363, %f29, %f362;
    // begin inline asm
    { cvt.f32.f16 %f358, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f363, %f358, %f448;
    // begin inline asm
    { cvt.f32.f16 %f359, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f447, %f363, %f359, %f447;
    // begin inline asm
    { cvt.f32.f16 %f360, %rs60;}
    // end inline asm
    fma.rn.ftz.f32 %f446, %f363, %f360, %f446;
    // begin inline asm
    { cvt.f32.f16 %f361, %rs61;}
    // end inline asm
    fma.rn.ftz.f32 %f445, %f363, %f361, %f445;
    mov.u32 %r406, %r68;
$L__BB0_66:
    // Main accumulation loop, unrolled 4x over timesteps (stride 128 = 4 x 32 warps).
    setp.lt.u32 %p98, %r63, 96;
    @%p98 bra $L__BB0_81;
    shl.b32 %r296, %r406, 2;
    add.s32 %r408, %r69, %r296;
$L__BB0_68:
    .pragma "nounroll";
    mov.u32 %r74, %r408;
    mul.lo.s32 %r75, %r406, %r4;
    cvt.s64.s32 %rd109, %r75;
    add.s64 %rd110, %rd109, %rd24;
    shl.b64 %rd112, %rd110, 1;
    add.s64 %rd33, %rd89, %rd112;
    setp.ne.s32 %p99, %r406, %r10;
    setp.eq.s32 %p100, %r406, %r10;
    selp.b64 %rd113, %rd25, %rd33, %p100;
    shl.b64 %rd114, %rd32, 3;
    add.s64 %rd115, %rd113, %rd114;
    ld.global.v4.u16 {%rs66, %rs67, %rs68, %rs69}, [%rd115];
    @%p99 bra $L__BB0_71;
    rem.s32 %r298, %r5, %r19;
    setp.ne.s32 %p101, %r298, 0;
    @%p101 bra $L__BB0_71;
    add.s64 %rd117, %rd33, %rd114;
    st.global.v4.u16 [%rd117], {%rs66, %rs67, %rs68, %rs69};
$L__BB0_71:
    ld.shared.f32 %f368, [%r74];
    mul.ftz.f32 %f369, %f29, %f368;
    // begin inline asm
    { cvt.f32.f16 %f364, %rs66;}
    // end inline asm
    fma.rn.ftz.f32 %f54, %f369, %f364, %f448;
    // begin inline asm
    { cvt.f32.f16 %f365, %rs67;}
    // end inline asm
    fma.rn.ftz.f32 %f55, %f369, %f365, %f447;
    // begin inline asm
    { cvt.f32.f16 %f366, %rs68;}
    // end inline asm
    fma.rn.ftz.f32 %f56, %f369, %f366, %f446;
    // begin inline asm
    { cvt.f32.f16 %f367, %rs69;}
    // end inline asm
    fma.rn.ftz.f32 %f57, %f369, %f367, %f445;
    add.s32 %r76, %r75, %r132;
    cvt.s64.s32 %rd118, %r76;
    add.s64 %rd119, %rd118, %rd24;
    shl.b64 %rd121, %rd119, 1;
    add.s64 %rd34, %rd89, %rd121;
    add.s32 %r300, %r406, 32;
    setp.ne.s32 %p102, %r300, %r10;
    setp.eq.s32 %p103, %r300, %r10;
    selp.b64 %rd122, %rd25, %rd34, %p103;
    add.s64 %rd124, %rd122, %rd114;
    ld.global.v4.u16 {%rs74, %rs75, %rs76, %rs77}, [%rd124];
    @%p102 bra $L__BB0_74;
    rem.s32 %r302, %r5, %r19;
    setp.ne.s32 %p104, %r302, 0;
    @%p104 bra $L__BB0_74;
    add.s64 %rd126, %rd34, %rd114;
    st.global.v4.u16 [%rd126], {%rs74, %rs75, %rs76, %rs77};
$L__BB0_74:
    ld.shared.f32 %f374, [%r74+128];
    mul.ftz.f32 %f375, %f29, %f374;
    // begin inline asm
    { cvt.f32.f16 %f370, %rs74;}
    // end inline asm
    fma.rn.ftz.f32 %f58, %f375, %f370, %f54;
    // begin inline asm
    { cvt.f32.f16 %f371, %rs75;}
    // end inline asm
    fma.rn.ftz.f32 %f59, %f375, %f371, %f55;
    // begin inline asm
    { cvt.f32.f16 %f372, %rs76;}
    // end inline asm
    fma.rn.ftz.f32 %f60, %f375, %f372, %f56;
    // begin inline asm
    { cvt.f32.f16 %f373, %rs77;}
    // end inline asm
    fma.rn.ftz.f32 %f61, %f375, %f373, %f57;
    add.s32 %r77, %r76, %r132;
    cvt.s64.s32 %rd127, %r77;
    add.s64 %rd128, %rd127, %rd24;
    shl.b64 %rd130, %rd128, 1;
    add.s64 %rd35, %rd89, %rd130;
    add.s32 %r304, %r406, 64;
    setp.ne.s32 %p105, %r304, %r10;
    setp.eq.s32 %p106, %r304, %r10;
    selp.b64 %rd131, %rd25, %rd35, %p106;
    add.s64 %rd133, %rd131, %rd114;
    ld.global.v4.u16 {%rs82, %rs83, %rs84, %rs85}, [%rd133];
    @%p105 bra $L__BB0_77;
    rem.s32 %r306, %r5, %r19;
    setp.ne.s32 %p107, %r306, 0;
    @%p107 bra $L__BB0_77;
    add.s64 %rd135, %rd35, %rd114;
    st.global.v4.u16 [%rd135], {%rs82, %rs83, %rs84, %rs85};
$L__BB0_77:
    ld.shared.f32 %f380, [%r74+256];
    mul.ftz.f32 %f381, %f29, %f380;
    // begin inline asm
    { cvt.f32.f16 %f376, %rs82;}
    // end inline asm
    fma.rn.ftz.f32 %f62, %f381, %f376, %f58;
    // begin inline asm
    { cvt.f32.f16 %f377, %rs83;}
    // end inline asm
    fma.rn.ftz.f32 %f63, %f381, %f377, %f59;
    // begin inline asm
    { cvt.f32.f16 %f378, %rs84;}
    // end inline asm
    fma.rn.ftz.f32 %f64, %f381, %f378, %f60;
    // begin inline asm
    { cvt.f32.f16 %f379, %rs85;}
    // end inline asm
    fma.rn.ftz.f32 %f65, %f381, %f379, %f61;
    add.s32 %r308, %r77, %r132;
    cvt.s64.s32 %rd136, %r308;
    add.s64 %rd137, %rd136, %rd24;
    shl.b64 %rd139, %rd137, 1;
    add.s64 %rd36, %rd89, %rd139;
    add.s32 %r309, %r406, 96;
    setp.ne.s32 %p108, %r309, %r10;
    setp.eq.s32 %p109, %r309, %r10;
    selp.b64 %rd140, %rd25, %rd36, %p109;
    add.s64 %rd142, %rd140, %rd114;
    ld.global.v4.u16 {%rs90, %rs91, %rs92, %rs93}, [%rd142];
    @%p108 bra $L__BB0_80;
    rem.s32 %r311, %r5, %r19;
    setp.ne.s32 %p110, %r311, 0;
    @%p110 bra $L__BB0_80;
    add.s64 %rd144, %rd36, %rd114;
    st.global.v4.u16 [%rd144], {%rs90, %rs91, %rs92, %rs93};
$L__BB0_80:
    add.s32 %r408, %r74, 512;
    ld.shared.f32 %f386, [%r74+384];
    mul.ftz.f32 %f387, %f29, %f386;
    // begin inline asm
    { cvt.f32.f16 %f382, %rs90;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f387, %f382, %f62;
    // begin inline asm
    { cvt.f32.f16 %f383, %rs91;}
    // end inline asm
    fma.rn.ftz.f32 %f447, %f387, %f383, %f63;
    // begin inline asm
    { cvt.f32.f16 %f384, %rs92;}
    // end inline asm
    fma.rn.ftz.f32 %f446, %f387, %f384, %f64;
    // begin inline asm
    { cvt.f32.f16 %f385, %rs93;}
    // end inline asm
    fma.rn.ftz.f32 %f445, %f387, %f385, %f65;
    add.s32 %r406, %r406, 128;
    setp.lt.s32 %p111, %r406, %r15;
    @%p111 bra $L__BB0_68;
$L__BB0_81:
    st.shared.v4.f32 at the per-warp slot, then advance to the next group of 4 output elements.
    shl.b32 %r312, %r405, 4;
    add.s32 %r313, %r62, %r312;
    st.shared.v4.f32 [%r313], {%f448, %f447, %f446, %f445};
    mov.u32 %r314, WARP_SZ;
    add.s32 %r405, %r405, %r314;
    setp.lt.s32 %p112, %r405, %r61;
    @%p112 bra $L__BB0_52;
$L__BB0_82:
    // Cross-warp reduction of the per-warp f32x4 output partials staged in shared memory.
    bar.sync 0;
    setp.ge.s32 %p113, %r409, %r61;
    @%p113 bra $L__BB0_90;
    add.s32 %r315, %r4, 4;
    mul.lo.s32 %r316, %r40, %r315;
    shl.b32 %r317, %r316, 2;
    add.s32 %r319, %r229, %r317;
    add.s32 %r81, %r319, 256;
$L__BB0_84:
    shl.b32 %r320, %r409, 4;
    add.s32 %r321, %r81, %r320;
    ld.shared.v4.f32 {%f453, %f454, %f455, %f456}, [%r321];
    bar.warp.sync -1;
    setp.lt.s32 %p114, %r157, 2;
    @%p114 bra $L__BB0_87;
    mov.u32 %r410, %r157;
$L__BB0_86:
    mov.b32 %r322, %f453;
    shr.u32 %r323, %r410, 31;
    add.s32 %r324, %r410, %r323;
    shr.s32 %r86, %r324, 1;
    mov.u32 %r325, 31;
    mov.u32 %r326, -1;
    shfl.sync.down.b32 %r327|%p115, %r322, %r86, %r325, %r326;
    mov.b32 %f392, %r327;
    add.ftz.f32 %f453, %f453, %f392;
    mov.b32 %r328, %f454;
    shfl.sync.down.b32 %r329|%p116, %r328, %r86, %r325, %r326;
    mov.b32 %f393, %r329;
    add.ftz.f32 %f454, %f454, %f393;
    mov.b32 %r330, %f455;
    shfl.sync.down.b32 %r331|%p117, %r330, %r86, %r325, %r326;
    mov.b32 %f394, %r331;
    add.ftz.f32 %f455, %f455, %f394;
    mov.b32 %r332, %f456;
    shfl.sync.down.b32 %r333|%p118, %r332, %r86, %r325, %r326;
    mov.b32 %f395, %r333;
    add.ftz.f32 %f456, %f456, %f395;
    setp.gt.s32 %p119, %r410, 3;
    mov.u32 %r410, %r86;
    @%p119 bra $L__BB0_86;
$L__BB0_87:
    setp.ne.s32 %p120, %r40, 0;
    @%p120 bra $L__BB0_89;
    add.s32 %r336, %r229, %r320;
    st.shared.v4.f32 [%r336+256], {%f453, %f454, %f455, %f456};
$L__BB0_89:
    add.s32 %r409, %r409, 32;
    setp.lt.s32 %p121, %r409, %r61;
    @%p121 bra $L__BB0_84;
$L__BB0_90:
    // Epilogue: split-sequence blocks (count > 256) write f32 partials to param+88 (BB0_98);
    // otherwise f32 pairs are repacked to f16x2 and stored to the output buffer at param+0 (BB0_91).
    min.s32 %r383, %r393, %r122;
    setp.gt.s32 %p135, %r383, 256;
    bar.sync 0;
    @%p135 bra $L__BB0_98;
    bra.uni $L__BB0_91;
$L__BB0_98:
    mov.u32 %r387, %ctaid.y;
    mad.lo.s32 %r386, %r123, %r387, %r5;
    mul.lo.s32 %r385, %r386, %r4;
    mov.u32 %r384, %ctaid.z;
    mov.b64 %rd170, MultiHeadAttention_mbh_kernel_param_0;
    mov.u64 %rd169, %rd170;
    ld.param.u64 %rd157, [%rd169+88];
    cvta.to.global.u64 %rd44, %rd157;
    mul.lo.s32 %r364, %r12, %r385;
    cvt.s64.s32 %rd45, %r364;
    mul.lo.s32 %r366, %r4, %r384;
    cvt.s64.s32 %rd46, %r366;
    mov.u32 %r420, %tid.x;
    setp.ge.s32 %p128, %r420, %r4;
    @%p128 bra $L__BB0_105;
    not.b32 %r368, %r420;
    add.s32 %r106, %r4, %r368;
    shr.u32 %r369, %r106, 10;
    add.s32 %r370, %r369, 1;
    and.b32 %r419, %r370, 3;
    setp.eq.s32 %p129, %r419, 0;
    @%p129 bra $L__BB0_102;
    mov.u32 %r420, %tid.x;
    cvt.s64.s32 %rd158, %r420;
    add.s64 %rd159, %rd158, %rd45;
    add.s64 %rd160, %rd159, %rd46;
    shl.b64 %rd161, %rd160, 2;
    add.s64 %rd179, %rd44, %rd161;
    shl.b32 %r371, %r420, 2;
    add.s32 %r373, %r229, %r371;
    add.s32 %r417, %r373, 256;
$L__BB0_101:
    .pragma "nounroll";
    ld.shared.f32 %f416, [%r417];
    st.global.f32 [%rd179], %f416;
    add.s32 %r420, %r420, 1024;
    add.s64 %rd179, %rd179, 4096;
    add.s32 %r417, %r417, 4096;
    add.s32 %r419, %r419, -1;
    setp.ne.s32 %p130, %r419, 0;
    @%p130 bra $L__BB0_101;
$L__BB0_102:
    setp.lt.u32 %p131, %r106, 3072;
    @%p131 bra $L__BB0_105;
    shl.b32 %r374, %r420, 2;
    add.s32 %r376, %r229, %r374;
    add.s32 %r421, %r376, 8448;
    cvt.s64.s32 %rd162, %r420;
    add.s64 %rd163, %rd46, %rd45;
    add.s64 %rd164, %rd163, %rd162;
    shl.b64 %rd165, %rd164, 2;
    add.s64 %rd166, %rd44, %rd165;
    add.s64 %rd180, %rd166, 8192;
$L__BB0_104:
    ld.shared.f32 %f417, [%r421+-8192];
    st.global.f32 [%rd180+-8192], %f417;
    ld.shared.f32 %f418, [%r421+-4096];
    st.global.f32 [%rd180+-4096], %f418;
    ld.shared.f32 %f419, [%r421];
    st.global.f32 [%rd180], %f419;
    ld.shared.f32 %f420, [%r421+4096];
    st.global.f32 [%rd180+4096], %f420;
    add.s32 %r421, %r421, 16384;
    add.s64 %rd180, %rd180, 16384;
    add.s32 %r420, %r420, 4096;
    setp.lt.s32 %p132, %r420, %r4;
    @%p132 bra $L__BB0_104;
    bra.uni $L__BB0_105;
$L__BB0_91:
    mov.b64 %rd168, MultiHeadAttention_mbh_kernel_param_0;
    mov.u64 %rd167, %rd168;
    setp.ge.s32 %p123, %r414, %r22;
    ld.param.u64 %rd146, [%rd167];
    cvta.to.global.u64 %rd37, %rd146;
    @%p123 bra $L__BB0_105;
    not.b32 %r338, %r414;
    add.s32 %r89, %r22, %r338;
    shr.u32 %r339, %r89, 10;
    add.s32 %r340, %r339, 1;
    and.b32 %r413, %r340, 3;
    setp.eq.s32 %p124, %r413, 0;
    @%p124 bra $L__BB0_95;
    mov.u32 %r414, %tid.x;
    shl.b32 %r341, %r414, 3;
    add.s32 %r343, %r229, %r341;
    add.s32 %r411, %r343, 256;
    mul.wide.s32 %rd147, %r414, 2;
    add.s64 %rd149, %rd147, %rd9;
    shl.b64 %rd150, %rd149, 1;
    add.s64 %rd177, %rd37, %rd150;
$L__BB0_94:
    .pragma "nounroll";
    ld.shared.v2.f32 {%f398, %f399}, [%r411];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f398; cvt.rn.f16.f32 high, %f399; mov.b32 %r348, {low,high};}
    // end inline asm
    st.global.u32 [%rd177], %r348;
    add.s32 %r414, %r414, 1024;
    add.s32 %r411, %r411, 8192;
    add.s64 %rd177, %rd177, 4096;
    add.s32 %r413, %r413, -1;
    setp.ne.s32 %p125, %r413, 0;
    @%p125 bra $L__BB0_94;
$L__BB0_95:
    setp.lt.u32 %p126, %r89, 3072;
    @%p126 bra $L__BB0_105;
    mul.wide.s32 %rd151, %r414, 2;
    add.s64 %rd153, %rd151, %rd9;
    shl.b64 %rd154, %rd153, 1;
    add.s64 %rd155, %rd37, %rd154;
    add.s64 %rd178, %rd155, 8192;
    shl.b32 %r353, %r414, 3;
    add.s32 %r355, %r229, %r353;
    add.s32 %r415, %r355, 16640;
$L__BB0_97:
    // 4x-unrolled pack-and-store loop: two f32 values from shared memory become one f16x2 word.
    ld.shared.v2.f32 {%f408, %f409}, [%r415+-16384];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f408; cvt.rn.f16.f32 high, %f409; mov.b32 %r356, {low,high};}
    // end inline asm
    st.global.u32 [%rd178+-8192], %r356;
    ld.shared.v2.f32 {%f410, %f411}, [%r415+-8192];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f410; cvt.rn.f16.f32 high, %f411; mov.b32 %r357, {low,high};}
    // end inline asm
    st.global.u32 [%rd178+-4096], %r357;
    ld.shared.v2.f32 {%f412, %f413}, [%r415];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f412; cvt.rn.f16.f32 high, %f413; mov.b32 %r358, {low,high};}
    // end inline asm
    st.global.u32 [%rd178], %r358;
    ld.shared.v2.f32 {%f414, %f415}, [%r415+8192];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f414; cvt.rn.f16.f32 high, %f415; mov.b32 %r359, {low,high};}
    // end inline asm
    st.global.u32 [%rd178+4096], %r359;
    add.s64 %rd178, %rd178, 16384;
    add.s32 %r415, %r415, 32768;
    add.s32 %r414, %r414, 4096;
    setp.lt.s32 %p127, %r414, %r22;
    @%p127 bra $L__BB0_97;
$L__BB0_105:
    ret;
}

// Empty kernel emitted by CUB.
// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
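The recurring `begin/end inline asm` regions above are all instances of two idioms: unpacking an f16x2 word into two f32 values on load, and repacking two f32 values into an f16x2 word on store. The CUDA sketch below shows the source-level equivalent; the helper names are hypothetical (this is not the original genmmha_fused_mbh.cu source), but the intrinsics compile to exactly the `mov.b32`/`cvt.f32.f16` and `cvt.rn.f16.f32`/`mov.b32` sequences in the listing.

```cuda
#include <cstdint>
#include <cstring>
#include <cuda_fp16.h>

// Hypothetical helpers illustrating the f16x2 <-> f32 idiom in the PTX above.

// Compiles to: mov.b32 {low,high}, v;  cvt.f32.f16 %f, low/high;
__device__ __forceinline__ float2 unpack_f16x2(uint32_t v) {
    __half2 h;
    memcpy(&h, &v, sizeof(h));   // reinterpret the 32-bit word as two packed halves
    return __half22float2(h);    // widen both lanes to f32
}

// Compiles to: cvt.rn.f16.f32 low/high, ...;  mov.b32 %r, {low,high};
__device__ __forceinline__ uint32_t pack_f16x2(float lo, float hi) {
    __half2 h = __floats2half2_rn(lo, hi);  // round-to-nearest-even narrowing
    uint32_t v;
    memcpy(&v, &h, sizeof(v));
    return v;
}
```

Loading one `uint32_t` and unpacking it in registers is why the staging loops above issue a single `ld.global.u32` per pair of elements instead of two scalar f16 loads: it halves the instruction count on the memory path while keeping all arithmetic in f32.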