.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN51_INTERNAL_cac5c90d_20_genmmha_fused_sbh_cu_7d8278296thrust12placeholders3_10E[1];
.extern .shared .align 16 .b8 smem[];

.visible .entry MultiHeadAttention_sbh_kernel(
    .param .align 8 .b8 MultiHeadAttention_sbh_kernel_param_0[144]
)
{
    .reg .pred %p<133>;
    .reg .b16 %rs<106>;
    .reg .f32 %f<481>;
    .reg .b32 %r<390>;
    .reg .b64 %rd<135>;

    mov.b64 %rd38, MultiHeadAttention_sbh_kernel_param_0;
    mov.u64 %rd1, %rd38;
    ld.param.u64 %rd3, [MultiHeadAttention_sbh_kernel_param_0+24];
    ld.param.u64 %rd4, [MultiHeadAttention_sbh_kernel_param_0+32];
    ld.param.u64 %rd6, [MultiHeadAttention_sbh_kernel_param_0+8];
    ld.param.u64 %rd39, [MultiHeadAttention_sbh_kernel_param_0+16];
    cvta.to.global.u64 %rd7, %rd39;
    ld.param.u64 %rd8, [MultiHeadAttention_sbh_kernel_param_0+48];
    ld.param.u64 %rd9, [MultiHeadAttention_sbh_kernel_param_0+56];
    ld.param.v2.u32 {%r123, %r124}, [MultiHeadAttention_sbh_kernel_param_0+112];
    ld.param.u32 %r4, [MultiHeadAttention_sbh_kernel_param_0+124];
    mov.u32 %r5, %ctaid.x;
    mov.u32 %r6, %ctaid.y;
    ld.param.u64 %rd40, [MultiHeadAttention_sbh_kernel_param_0+80];
    setp.eq.s64 %p1, %rd40, 0;
    mov.u32 %r355, %r123;
    @%p1 bra $L__BB0_2;

    ld.param.u64 %rd41, [%rd1+80];
    cvta.to.global.u64 %rd42, %rd41;
    mul.wide.s32 %rd43, %r6, 4;
    add.s64 %rd44, %rd42, %rd43;
    ld.global.u32 %r125, [%rd44];
    add.s32 %r355, %r125, 1;

$L__BB0_2:
    add.s32 %r126, %r355, -1;
    rem.s32 %r9, %r126, %r123;
    min.s32 %r10, %r355, %r123;
    mad.lo.s32 %r127, %r124, %r6, %r5;
    mul.lo.s32 %r128, %r127, %r4;
    cvta.to.global.u64 %rd45, %rd3;
    mul.wide.s32 %rd46, %r128, 2;
    add.s64 %rd10, %rd45, %rd46;
    shl.b32 %r11, %r4, 5;
    add.s32 %r12, %r11, 128;
    ld.param.u32 %r129, [%rd1+120];
    div.s32 %r13, %r124, %r129;
    div.s32 %r130, %r5, %r13;
    mad.lo.s32 %r131, %r129, %r6, %r130;
    mul.lo.s32 %r14, %r131, %r4;
    mul.lo.s32 %r15, %r14, %r123;
    shr.u32 %r132, %r4, 31;
    add.s32 %r133, %r4, %r132;
    shr.s32 %r16, %r133, 1;
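    //
    // [editorial annotation, not compiler output] The block appears to stage
    // the fp16 query vector for its (batch, head) pair -- %r16 = %r4/2 packed
    // u32 words read from the buffer at param+24 -- into shared memory at
    // smem+256, using a nounroll remainder loop plus a 4x-unrolled main loop.
    //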
    mov.u32 %r17, %tid.x;
    setp.ge.s32 %p2, %r17, %r16;
    @%p2 bra $L__BB0_9;

    not.b32 %r134, %r17;
    add.s32 %r18, %r16, %r134;
    shr.u32 %r135, %r18, 10;
    add.s32 %r136, %r135, 1;
    and.b32 %r357, %r136, 3;
    setp.eq.s32 %p3, %r357, 0;
    mov.u32 %r358, %r17;
    @%p3 bra $L__BB0_6;

    mov.u32 %r358, %r17;

$L__BB0_5:
    .pragma "nounroll";
    shl.b32 %r137, %r358, 2;
    mov.u32 %r138, smem;
    add.s32 %r139, %r138, %r137;
    mul.wide.s32 %rd47, %r358, 4;
    add.s64 %rd48, %rd10, %rd47;
    ld.global.u32 %r140, [%rd48];
    st.shared.u32 [%r139+256], %r140;
    add.s32 %r358, %r358, 1024;
    add.s32 %r357, %r357, -1;
    setp.ne.s32 %p4, %r357, 0;
    @%p4 bra $L__BB0_5;

$L__BB0_6:
    setp.lt.u32 %p5, %r18, 3072;
    @%p5 bra $L__BB0_9;

    mov.u32 %r143, smem;

$L__BB0_8:
    mul.wide.s32 %rd49, %r358, 4;
    add.s64 %rd50, %rd10, %rd49;
    ld.global.u32 %r141, [%rd50];
    shl.b32 %r142, %r358, 2;
    add.s32 %r144, %r143, %r142;
    st.shared.u32 [%r144+256], %r141;
    ld.global.u32 %r145, [%rd50+4096];
    st.shared.u32 [%r144+4352], %r145;
    ld.global.u32 %r146, [%rd50+8192];
    st.shared.u32 [%r144+8448], %r146;
    ld.global.u32 %r147, [%rd50+12288];
    st.shared.u32 [%r144+12544], %r147;
    add.s32 %r358, %r358, 4096;
    setp.lt.s32 %p6, %r358, %r16;
    @%p6 bra $L__BB0_8;
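    //
    // [editorial annotation, not compiler output] Logit phase: each warp
    // appears to take one cached timestep t, compute dot(q, K[t]) in half4
    // chunks (ld.global.v4.u16 + cvt.f32.f16 + fma), tree-reduce with
    // shfl.sync.down, scale by %f97 and add either a fp16 bias (param+56) or
    // the additive mask value %f98 (param+48), then store the logit to smem.
    // At the current cyclic cache slot (%r9) the fresh key from param+32 is
    // used instead and written through to the K cache.
    //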
$L__BB0_9:
    bar.sync 0;
    mov.u32 %r27, WARP_SZ;
    div.u32 %r382, %r17, %r27;
    mul.lo.s32 %r148, %r382, %r27;
    sub.s32 %r29, %r17, %r148;
    setp.ge.s32 %p7, %r382, %r10;
    @%p7 bra $L__BB0_27;

    cvt.s64.s32 %rd11, %r15;
    cvta.to.global.u64 %rd51, %rd4;
    mul.wide.s32 %rd52, %r14, 2;
    add.s64 %rd12, %rd51, %rd52;
    shr.s32 %r149, %r4, 31;
    shr.u32 %r150, %r149, 30;
    add.s32 %r151, %r4, %r150;
    shr.s32 %r30, %r151, 2;
    ld.param.v2.f32 {%f97, %f98}, [%rd1+128];
    mad.lo.s32 %r152, %r4, %r6, %r5;
    mul.lo.s32 %r31, %r152, %r10;
    mul.lo.s32 %r32, %r10, %r6;
    cvta.to.global.u64 %rd13, %rd9;
    cvta.to.global.u64 %rd14, %rd8;
    cvta.to.global.u64 %rd15, %rd6;
    mov.u32 %r360, %r382;

$L__BB0_11:
    setp.ge.s32 %p8, %r29, %r30;
    mov.f32 %f442, 0f00000000;
    @%p8 bra $L__BB0_17;

    mul.lo.s32 %r153, %r360, %r4;
    cvt.s64.s32 %rd53, %r153;
    add.s64 %rd54, %rd53, %rd11;
    shl.b64 %rd55, %rd54, 1;
    add.s64 %rd16, %rd15, %rd55;
    setp.eq.s32 %p9, %r360, %r9;
    selp.b64 %rd17, %rd12, %rd16, %p9;
    mov.u32 %r361, %r29;

$L__BB0_13:
    cvt.s64.s32 %rd18, %r361;
    mul.wide.s32 %rd56, %r361, 8;
    add.s64 %rd57, %rd17, %rd56;
    ld.global.v4.u16 {%rs33, %rs34, %rs35, %rs36}, [%rd57];
    setp.ne.s32 %p10, %r360, %r9;
    @%p10 bra $L__BB0_16;

    rem.s32 %r154, %r5, %r13;
    setp.ne.s32 %p11, %r154, 0;
    @%p11 bra $L__BB0_16;

    shl.b64 %rd58, %rd18, 3;
    add.s64 %rd59, %rd16, %rd58;
    st.global.v4.u16 [%rd59], {%rs33, %rs34, %rs35, %rs36};

$L__BB0_16:
    cvt.u32.u64 %r155, %rd18;
    shl.b32 %r156, %r155, 3;
    mov.u32 %r157, smem;
    add.s32 %r158, %r157, %r156;
    ld.shared.v4.u16 {%rs45, %rs46, %rs47, %rs48}, [%r158+256];
    // begin inline asm
    { cvt.f32.f16 %f101, %rs45;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f102, %rs33;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f103, %rs46;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f104, %rs34;}
    // end inline asm
    mul.ftz.f32 %f109, %f103, %f104;
    fma.rn.ftz.f32 %f110, %f101, %f102, %f109;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs47;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f106, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f111, %f105, %f106, %f110;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs48;}
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f108, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f112, %f107, %f108, %f111;
    add.ftz.f32 %f442, %f442, %f112;
    add.s32 %r361, %r155, %r27;
    setp.lt.s32 %p12, %r361, %r30;
    @%p12 bra $L__BB0_13;

$L__BB0_17:
    bar.warp.sync -1;
    setp.lt.s32 %p13, %r27, 2;
    @%p13 bra $L__BB0_20;

    mov.u32 %r362, %r27;

$L__BB0_19:
    mov.b32 %r159, %f442;
    shr.u32 %r160, %r362, 31;
    add.s32 %r161, %r362, %r160;
    shr.s32 %r37, %r161, 1;
    mov.u32 %r162, 31;
    mov.u32 %r163, -1;
    shfl.sync.down.b32 %r164|%p14, %r159, %r37, %r162, %r163;
    mov.b32 %f113, %r164;
    add.ftz.f32 %f442, %f442, %f113;
    setp.gt.s32 %p15, %r362, 3;
    mov.u32 %r362, %r37;
    @%p15 bra $L__BB0_19;

$L__BB0_20:
    setp.ne.s32 %p16, %r29, 0;
    @%p16 bra $L__BB0_26;

    setp.eq.s64 %p17, %rd9, 0;
    mul.ftz.f32 %f443, %f442, %f97;
    @%p17 bra $L__BB0_23;

    add.s32 %r165, %r360, %r31;
    mul.wide.s32 %rd60, %r165, 2;
    add.s64 %rd61, %rd13, %rd60;
    ld.global.u16 %rs49, [%rd61];
    // begin inline asm
    { cvt.f32.f16 %f114, %rs49;}
    // end inline asm
    add.ftz.f32 %f443, %f443, %f114;
    bra.uni $L__BB0_25;

$L__BB0_23:
    setp.eq.s64 %p18, %rd8, 0;
    @%p18 bra $L__BB0_25;

    add.s32 %r166, %r360, %r32;
    mul.wide.s32 %rd62, %r166, 4;
    add.s64 %rd63, %rd14, %rd62;
    ld.global.u32 %r167, [%rd63];
    setp.eq.s32 %p19, %r167, 0;
    selp.f32 %f115, %f98, 0f00000000, %p19;
    add.ftz.f32 %f443, %f443, %f115;

$L__BB0_25:
    add.s32 %r168, %r360, %r12;
    shl.b32 %r169, %r168, 2;
    mov.u32 %r170, smem;
    add.s32 %r171, %r170, %r169;
    st.shared.f32 [%r171+256], %f443;

$L__BB0_26:
    add.s32 %r360, %r360, 32;
    setp.lt.s32 %p20, %r360, %r10;
    @%p20 bra $L__BB0_11;
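    //
    // [editorial annotation, not compiler output] Block-wide max of the %r10
    // logits: a strided max over shared memory, a shfl.sync.down butterfly
    // within each warp, per-warp results parked at smem+32.., and thread 0
    // folding them into a single value at smem+164. The identity is
    // 0fFF7FFFFF (-FLT_MAX).
    //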
$L__BB0_27:
    bar.sync 0;
    mov.u32 %r39, %ntid.x;
    setp.ge.s32 %p21, %r17, %r10;
    mov.f32 %f449, 0fFF7FFFFF;
    @%p21 bra $L__BB0_29;

    add.s32 %r172, %r12, %r17;
    shl.b32 %r173, %r172, 2;
    mov.u32 %r174, smem;
    add.s32 %r175, %r174, %r173;
    ld.shared.f32 %f449, [%r175+256];

$L__BB0_29:
    add.s32 %r365, %r17, %r39;
    setp.ge.s32 %p22, %r365, %r10;
    @%p22 bra $L__BB0_36;

    not.b32 %r176, %r10;
    add.s32 %r177, %r365, %r176;
    mov.u32 %r178, -2;
    sub.s32 %r179, %r178, %r177;
    div.u32 %r41, %r179, %r39;
    add.s32 %r180, %r41, 1;
    and.b32 %r364, %r180, 3;
    setp.eq.s32 %p23, %r364, 0;
    @%p23 bra $L__BB0_33;

$L__BB0_32:
    .pragma "nounroll";
    add.s32 %r181, %r365, %r12;
    shl.b32 %r182, %r181, 2;
    mov.u32 %r183, smem;
    add.s32 %r184, %r183, %r182;
    ld.shared.f32 %f118, [%r184+256];
    setp.gt.ftz.f32 %p24, %f118, %f449;
    selp.f32 %f449, %f118, %f449, %p24;
    add.s32 %r365, %r365, %r39;
    add.s32 %r364, %r364, -1;
    setp.ne.s32 %p25, %r364, 0;
    @%p25 bra $L__BB0_32;

$L__BB0_33:
    setp.lt.u32 %p26, %r41, 3;
    @%p26 bra $L__BB0_36;

    shl.b32 %r185, %r4, 7;
    shl.b32 %r186, %r365, 2;
    add.s32 %r187, %r185, %r186;
    mov.u32 %r188, smem;
    add.s32 %r189, %r188, %r187;
    add.s32 %r367, %r189, 768;
    shl.b32 %r49, %r39, 2;

$L__BB0_35:
    ld.shared.f32 %f119, [%r367];
    setp.gt.ftz.f32 %p27, %f119, %f449;
    selp.f32 %f120, %f119, %f449, %p27;
    add.s32 %r190, %r367, %r49;
    ld.shared.f32 %f121, [%r190];
    setp.gt.ftz.f32 %p28, %f121, %f120;
    selp.f32 %f122, %f121, %f120, %p28;
    add.s32 %r191, %r365, %r39;
    add.s32 %r192, %r191, %r39;
    add.s32 %r193, %r190, %r49;
    ld.shared.f32 %f123, [%r193];
    setp.gt.ftz.f32 %p29, %f123, %f122;
    selp.f32 %f124, %f123, %f122, %p29;
    add.s32 %r194, %r192, %r39;
    add.s32 %r195, %r193, %r49;
    add.s32 %r367, %r195, %r49;
    ld.shared.f32 %f125, [%r195];
    setp.gt.ftz.f32 %p30, %f125, %f124;
    selp.f32 %f449, %f125, %f124, %p30;
    add.s32 %r365, %r194, %r39;
    setp.lt.s32 %p31, %r365, %r10;
    @%p31 bra $L__BB0_35;

$L__BB0_36:
    // begin inline asm
    mov.u32 %r196, %laneid;
    // end inline asm
    // begin inline asm
    mov.u32 %r197, %laneid;
    // end inline asm
    mov.b32 %r199, %f449;
    mov.u32 %r200, 1;
    mov.u32 %r221, 31;
    mov.u32 %r222, -1;
    // begin inline asm
    shfl.sync.down.b32 %r198, %r199, %r200, %r221, %r222;
    // end inline asm
    mov.b32 %f126, %r198;
    setp.lt.s32 %p32, %r197, 31;
    setp.lt.ftz.f32 %p33, %f449, %f126;
    and.pred %p34, %p32, %p33;
    selp.f32 %f127, %f126, %f449, %p34;
    mov.b32 %r204, %f127;
    mov.u32 %r205, 2;
    // begin inline asm
    shfl.sync.down.b32 %r203, %r204, %r205, %r221, %r222;
    // end inline asm
    mov.b32 %f128, %r203;
    setp.lt.s32 %p35, %r197, 30;
    setp.lt.ftz.f32 %p36, %f127, %f128;
    and.pred %p37, %p35, %p36;
    selp.f32 %f129, %f128, %f127, %p37;
    mov.b32 %r209, %f129;
    mov.u32 %r210, 4;
    // begin inline asm
    shfl.sync.down.b32 %r208, %r209, %r210, %r221, %r222;
    // end inline asm
    mov.b32 %f130, %r208;
    setp.lt.s32 %p38, %r197, 28;
    setp.lt.ftz.f32 %p39, %f129, %f130;
    and.pred %p40, %p38, %p39;
    selp.f32 %f131, %f130, %f129, %p40;
    mov.b32 %r214, %f131;
    mov.u32 %r215, 8;
    // begin inline asm
    shfl.sync.down.b32 %r213, %r214, %r215, %r221, %r222;
    // end inline asm
    mov.b32 %f132, %r213;
    setp.lt.s32 %p41, %r197, 24;
    setp.lt.ftz.f32 %p42, %f131, %f132;
    and.pred %p43, %p41, %p42;
    selp.f32 %f133, %f132, %f131, %p43;
    mov.b32 %r219, %f133;
    mov.u32 %r220, 16;
    // begin inline asm
    shfl.sync.down.b32 %r218, %r219, %r220, %r221, %r222;
    // end inline asm
    mov.b32 %f134, %r218;
    setp.lt.s32 %p44, %r197, 16;
    setp.lt.ftz.f32 %p45, %f133, %f134;
    and.pred %p46, %p44, %p45;
    selp.f32 %f450, %f134, %f133, %p46;
    setp.ne.s32 %p47, %r196, 0;
    shr.s32 %r223, %r17, 31;
    shr.u32 %r224, %r223, 27;
    add.s32 %r225, %r17, %r224;
    shr.s32 %r226, %r225, 5;
    shl.b32 %r227, %r226, 2;
    mov.u32 %r228, smem;
    add.s32 %r229, %r228, %r227;
    @%p47 bra $L__BB0_38;

    add.s32 %r349, %r229, 32;
    st.shared.f32 [%r349], %f450;

$L__BB0_38:
    bar.sync 0;
    setp.ne.s32 %p48, %r17, 0;
    @%p48 bra $L__BB0_40;

    ld.shared.f32 %f135, [smem+36];
    setp.gt.ftz.f32 %p49, %f135, %f450;
    selp.f32 %f136, %f135, %f450, %p49;
    ld.shared.v2.f32 {%f137, %f138}, [smem+40];
    setp.gt.ftz.f32 %p50, %f137, %f136;
    selp.f32 %f141, %f137, %f136, %p50;
    setp.gt.ftz.f32 %p51, %f138, %f141;
    selp.f32 %f142, %f138, %f141, %p51;
    ld.shared.v4.f32 {%f143, %f144, %f145, %f146}, [smem+48];
    setp.gt.ftz.f32 %p52, %f143, %f142;
    selp.f32 %f151, %f143, %f142, %p52;
    setp.gt.ftz.f32 %p53, %f144, %f151;
    selp.f32 %f152, %f144, %f151, %p53;
    setp.gt.ftz.f32 %p54, %f145, %f152;
    selp.f32 %f153, %f145, %f152, %p54;
    setp.gt.ftz.f32 %p55, %f146, %f153;
    selp.f32 %f154, %f146, %f153, %p55;
    ld.shared.v4.f32 {%f155, %f156, %f157, %f158}, [smem+64];
    setp.gt.ftz.f32 %p56, %f155, %f154;
    selp.f32 %f163, %f155, %f154, %p56;
    setp.gt.ftz.f32 %p57, %f156, %f163;
    selp.f32 %f164, %f156, %f163, %p57;
    setp.gt.ftz.f32 %p58, %f157, %f164;
    selp.f32 %f165, %f157, %f164, %p58;
    setp.gt.ftz.f32 %p59, %f158, %f165;
    selp.f32 %f166, %f158, %f165, %p59;
    ld.shared.v4.f32 {%f167, %f168, %f169, %f170}, [smem+80];
    setp.gt.ftz.f32 %p60, %f167, %f166;
    selp.f32 %f175, %f167, %f166, %p60;
    setp.gt.ftz.f32 %p61, %f168, %f175;
    selp.f32 %f176, %f168, %f175, %p61;
    setp.gt.ftz.f32 %p62, %f169, %f176;
    selp.f32 %f177, %f169, %f176, %p62;
    setp.gt.ftz.f32 %p63, %f170, %f177;
    selp.f32 %f178, %f170, %f177, %p63;
    ld.shared.v4.f32 {%f179, %f180, %f181, %f182}, [smem+96];
    setp.gt.ftz.f32 %p64, %f179, %f178;
    selp.f32 %f187, %f179, %f178, %p64;
    setp.gt.ftz.f32 %p65, %f180, %f187;
    selp.f32 %f188, %f180, %f187, %p65;
    setp.gt.ftz.f32 %p66, %f181, %f188;
    selp.f32 %f189, %f181, %f188, %p66;
    setp.gt.ftz.f32 %p67, %f182, %f189;
    selp.f32 %f190, %f182, %f189, %p67;
    ld.shared.v4.f32 {%f191, %f192, %f193, %f194}, [smem+112];
    setp.gt.ftz.f32 %p68, %f191, %f190;
    selp.f32 %f199, %f191, %f190, %p68;
    setp.gt.ftz.f32 %p69, %f192, %f199;
    selp.f32 %f200, %f192, %f199, %p69;
    setp.gt.ftz.f32 %p70, %f193, %f200;
    selp.f32 %f201, %f193, %f200, %p70;
    setp.gt.ftz.f32 %p71, %f194, %f201;
    selp.f32 %f202, %f194, %f201, %p71;
    ld.shared.v4.f32 {%f203, %f204, %f205, %f206}, [smem+128];
    setp.gt.ftz.f32 %p72, %f203, %f202;
    selp.f32 %f211, %f203, %f202, %p72;
    setp.gt.ftz.f32 %p73, %f204, %f211;
    selp.f32 %f212, %f204, %f211, %p73;
    setp.gt.ftz.f32 %p74, %f205, %f212;
    selp.f32 %f213, %f205, %f212, %p74;
    setp.gt.ftz.f32 %p75, %f206, %f213;
    selp.f32 %f214, %f206, %f213, %p75;
    ld.shared.v4.f32 {%f215, %f216, %f217, %f218}, [smem+144];
    setp.gt.ftz.f32 %p76, %f215, %f214;
    selp.f32 %f223, %f215, %f214, %p76;
    setp.gt.ftz.f32 %p77, %f216, %f223;
    selp.f32 %f224, %f216, %f223, %p77;
    setp.gt.ftz.f32 %p78, %f217, %f224;
    selp.f32 %f225, %f217, %f224, %p78;
    setp.gt.ftz.f32 %p79, %f218, %f225;
    selp.f32 %f450, %f218, %f225, %p79;

$L__BB0_40:
    @%p48 bra $L__BB0_42;

    st.shared.f32 [smem+164], %f450;
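    //
    // [editorial annotation, not compiler output] Softmax numerator: each
    // logit x is rewritten as ex2.approx((x - max) * 0f3FB8AA3B), i.e.
    // exp(x - max) computed as 2^((x - max) * log2 e); per-thread partial
    // sums accumulate in %f455 and the probabilities overwrite the logits
    // in shared memory.
    //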
$L__BB0_42:
    mov.u32 %r351, %tid.x;
    setp.ge.s32 %p132, %r351, %r10;
    bar.sync 0;
    mov.f32 %f455, 0f00000000;
    ld.shared.f32 %f25, [smem+164];
    @%p132 bra $L__BB0_49;

    mov.u32 %r370, %tid.x;
    not.b32 %r230, %r10;
    add.s32 %r231, %r370, %r230;
    mov.u32 %r232, -2;
    sub.s32 %r233, %r232, %r231;
    div.u32 %r55, %r233, %r39;
    add.s32 %r234, %r55, 1;
    and.b32 %r369, %r234, 3;
    setp.eq.s32 %p82, %r369, 0;
    mov.f32 %f455, 0f00000000;
    @%p82 bra $L__BB0_46;

    mov.u32 %r370, %tid.x;

$L__BB0_45:
    .pragma "nounroll";
    add.s32 %r235, %r370, %r12;
    shl.b32 %r236, %r235, 2;
    add.s32 %r238, %r228, %r236;
    ld.shared.f32 %f230, [%r238+256];
    sub.ftz.f32 %f231, %f230, %f25;
    mul.ftz.f32 %f232, %f231, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f233, %f232;
    add.ftz.f32 %f455, %f455, %f233;
    st.shared.f32 [%r238+256], %f233;
    add.s32 %r370, %r370, %r39;
    add.s32 %r369, %r369, -1;
    setp.ne.s32 %p83, %r369, 0;
    @%p83 bra $L__BB0_45;

$L__BB0_46:
    setp.lt.u32 %p84, %r55, 3;
    @%p84 bra $L__BB0_49;

    shl.b32 %r239, %r4, 7;
    shl.b32 %r240, %r370, 2;
    add.s32 %r241, %r239, %r240;
    add.s32 %r243, %r228, %r241;
    add.s32 %r372, %r243, 768;
    shl.b32 %r63, %r39, 2;

$L__BB0_48:
    ld.shared.f32 %f234, [%r372];
    sub.ftz.f32 %f235, %f234, %f25;
    mul.ftz.f32 %f236, %f235, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f237, %f236;
    add.ftz.f32 %f238, %f455, %f237;
    st.shared.f32 [%r372], %f237;
    add.s32 %r244, %r372, %r63;
    ld.shared.f32 %f239, [%r244];
    sub.ftz.f32 %f240, %f239, %f25;
    mul.ftz.f32 %f241, %f240, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f242, %f241;
    add.ftz.f32 %f243, %f238, %f242;
    st.shared.f32 [%r244], %f242;
    add.s32 %r245, %r370, %r39;
    add.s32 %r246, %r245, %r39;
    add.s32 %r247, %r244, %r63;
    ld.shared.f32 %f244, [%r247];
    sub.ftz.f32 %f245, %f244, %f25;
    mul.ftz.f32 %f246, %f245, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f247, %f246;
    add.ftz.f32 %f248, %f243, %f247;
    st.shared.f32 [%r247], %f247;
    add.s32 %r248, %r246, %r39;
    add.s32 %r249, %r247, %r63;
    add.s32 %r372, %r249, %r63;
    ld.shared.f32 %f249, [%r249];
    sub.ftz.f32 %f250, %f249, %f25;
    mul.ftz.f32 %f251, %f250, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f252, %f251;
    add.ftz.f32 %f455, %f248, %f252;
    st.shared.f32 [%r249], %f252;
    add.s32 %r370, %r248, %r39;
    setp.lt.s32 %p85, %r370, %r10;
    @%p85 bra $L__BB0_48;
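    //
    // [editorial annotation, not compiler output] Sum reduction mirroring
    // the max reduction: predicated shfl.sync.down adds within each warp,
    // per-warp totals parked at smem+32.., thread 0 serially folds the
    // per-warp slots, and the grand total lands at smem+164.
    //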
$L__BB0_49:
    bar.sync 0;
    // begin inline asm
    mov.u32 %r250, %laneid;
    // end inline asm
    mov.u32 %r252, 1;
    mov.u32 %r265, 31;
    mov.u32 %r266, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f455, %r252, %r265, %r266; @p add.f32 r0, r0, %f455; mov.f32 %f253, r0;}
    // end inline asm
    mov.u32 %r255, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f253, %r255, %r265, %r266; @p add.f32 r0, r0, %f253; mov.f32 %f256, r0;}
    // end inline asm
    mov.u32 %r258, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f256, %r258, %r265, %r266; @p add.f32 r0, r0, %f256; mov.f32 %f259, r0;}
    // end inline asm
    mov.u32 %r261, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f259, %r261, %r265, %r266; @p add.f32 r0, r0, %f259; mov.f32 %f262, r0;}
    // end inline asm
    mov.u32 %r264, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f262, %r264, %r265, %r266; @p add.f32 r0, r0, %f262; mov.f32 %f456, r0;}
    // end inline asm
    setp.ne.s32 %p86, %r250, 0;
    @%p86 bra $L__BB0_51;

    add.s32 %r350, %r229, 32;
    st.shared.f32 [%r350], %f456;

$L__BB0_51:
    bar.sync 0;
    @%p48 bra $L__BB0_53;

    ld.shared.f32 %f268, [smem+36];
    add.ftz.f32 %f269, %f456, %f268;
    ld.shared.v2.f32 {%f270, %f271}, [smem+40];
    add.ftz.f32 %f274, %f269, %f270;
    add.ftz.f32 %f275, %f274, %f271;
    ld.shared.v4.f32 {%f276, %f277, %f278, %f279}, [smem+48];
    add.ftz.f32 %f284, %f275, %f276;
    add.ftz.f32 %f285, %f284, %f277;
    add.ftz.f32 %f286, %f285, %f278;
    add.ftz.f32 %f287, %f286, %f279;
    ld.shared.v4.f32 {%f288, %f289, %f290, %f291}, [smem+64];
    add.ftz.f32 %f296, %f287, %f288;
    add.ftz.f32 %f297, %f296, %f289;
    add.ftz.f32 %f298, %f297, %f290;
    add.ftz.f32 %f299, %f298, %f291;
    ld.shared.v4.f32 {%f300, %f301, %f302, %f303}, [smem+80];
    add.ftz.f32 %f308, %f299, %f300;
    add.ftz.f32 %f309, %f308, %f301;
    add.ftz.f32 %f310, %f309, %f302;
    add.ftz.f32 %f311, %f310, %f303;
    ld.shared.v4.f32 {%f312, %f313, %f314, %f315}, [smem+96];
    add.ftz.f32 %f320, %f311, %f312;
    add.ftz.f32 %f321, %f320, %f313;
    add.ftz.f32 %f322, %f321, %f314;
    add.ftz.f32 %f323, %f322, %f315;
    ld.shared.v4.f32 {%f324, %f325, %f326, %f327}, [smem+112];
    add.ftz.f32 %f332, %f323, %f324;
    add.ftz.f32 %f333, %f332, %f325;
    add.ftz.f32 %f334, %f333, %f326;
    add.ftz.f32 %f335, %f334, %f327;
    ld.shared.v4.f32 {%f336, %f337, %f338, %f339}, [smem+128];
    add.ftz.f32 %f344, %f335, %f336;
    add.ftz.f32 %f345, %f344, %f337;
    add.ftz.f32 %f346, %f345, %f338;
    add.ftz.f32 %f347, %f346, %f339;
    ld.shared.v4.f32 {%f348, %f349, %f350, %f351}, [smem+144];
    add.ftz.f32 %f356, %f347, %f348;
    add.ftz.f32 %f357, %f356, %f349;
    add.ftz.f32 %f358, %f357, %f350;
    add.ftz.f32 %f456, %f358, %f351;

$L__BB0_53:
    @%p48 bra $L__BB0_55;

    st.shared.f32 [smem+164], %f456;
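    //
    // [editorial annotation, not compiler output] rcp.approx of the total
    // gives the softmax normalizer 1/sum (%f36). The value phase follows:
    // each warp appears to accumulate out += prob[t] * (1/sum) * V[t] in
    // half4 chunks, reading cached values (param+16) except at the current
    // slot %r9, where the fresh value vector (param+40) is used and -- when
    // %r5 mod %r13 == 0, i.e. the first query head of a GQA group -- written
    // through to the cache. The block below is the remainder prologue for
    // the 4x-unrolled timestep loop.
    //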
$L__BB0_55:
    bar.sync 0;
    ld.shared.f32 %f359, [smem+164];
    rcp.approx.ftz.f32 %f36, %f359;
    shr.s32 %r267, %r4, 31;
    shr.u32 %r268, %r267, 30;
    add.s32 %r269, %r4, %r268;
    shr.s32 %r68, %r269, 2;
    setp.ge.s32 %p89, %r29, %r68;
    @%p89 bra $L__BB0_87;

    shl.b32 %r352, %r4, 5;
    ld.param.u64 %rd132, [MultiHeadAttention_sbh_kernel_param_0+40];
    add.s32 %r270, %r4, 4;
    mul.lo.s32 %r271, %r382, %r270;
    shl.b32 %r272, %r271, 2;
    add.s32 %r274, %r228, 256;
    add.s32 %r69, %r274, %r272;
    cvta.to.global.u64 %rd64, %rd132;
    mul.wide.s32 %rd65, %r14, 2;
    add.s64 %rd19, %rd64, %rd65;
    not.b32 %r275, %r382;
    add.s32 %r70, %r10, %r275;
    shr.u32 %r276, %r70, 5;
    add.s32 %r277, %r276, 1;
    and.b32 %r71, %r277, 3;
    mul.lo.s32 %r278, %r382, %r4;
    cvt.s64.s32 %rd66, %r278;
    cvt.s64.s32 %rd20, %r15;
    add.s64 %rd67, %rd66, %rd20;
    shl.b64 %rd68, %rd67, 1;
    add.s64 %rd21, %rd7, %rd68;
    setp.eq.s32 %p90, %r382, %r9;
    selp.b64 %rd22, %rd19, %rd21, %p90;
    add.s32 %r279, %r382, %r12;
    shl.b32 %r280, %r279, 2;
    add.s32 %r72, %r274, %r280;
    add.s32 %r281, %r278, %r352;
    cvt.s64.s32 %rd69, %r281;
    add.s64 %rd70, %rd69, %rd20;
    shl.b64 %rd71, %rd70, 1;
    add.s64 %rd23, %rd7, %rd71;
    add.s32 %r282, %r382, 32;
    setp.eq.s32 %p91, %r282, %r9;
    selp.b64 %rd24, %rd19, %rd23, %p91;
    add.s32 %r283, %r281, %r352;
    cvt.s64.s32 %rd72, %r283;
    add.s64 %rd73, %rd72, %rd20;
    shl.b64 %rd74, %rd73, 1;
    add.s64 %rd25, %rd7, %rd74;
    add.s32 %r284, %r382, 64;
    setp.eq.s32 %p92, %r284, %r9;
    selp.b64 %rd26, %rd19, %rd25, %p92;
    shl.b32 %r73, %r4, 7;
    add.s32 %r285, %r228, %r73;
    add.s32 %r74, %r285, 768;
    mov.u32 %r373, %r29;

$L__BB0_57:
    mov.f32 %f469, 0f00000000;
    mov.f32 %f470, %f469;
    mov.f32 %f471, %f469;
    mov.f32 %f472, %f469;
    @%p7 bra $L__BB0_86;

    setp.eq.s32 %p94, %r71, 0;
    cvt.s64.s32 %rd27, %r373;
    mov.f32 %f472, 0f00000000;
    mov.u32 %r374, %r382;
    mov.f32 %f471, %f472;
    mov.f32 %f470, %f472;
    mov.f32 %f469, %f472;
    @%p94 bra $L__BB0_71;

    shl.b64 %rd75, %rd27, 3;
    add.s64 %rd76, %rd22, %rd75;
    ld.global.v4.u16 {%rs50, %rs51, %rs52, %rs53}, [%rd76];
    setp.ne.s32 %p95, %r382, %r9;
    @%p95 bra $L__BB0_62;

    rem.s32 %r286, %r5, %r13;
    setp.ne.s32 %p96, %r286, 0;
    @%p96 bra $L__BB0_62;

    add.s64 %rd78, %rd21, %rd75;
    st.global.v4.u16 [%rd78], {%rs50, %rs51, %rs52, %rs53};

$L__BB0_62:
    setp.eq.s32 %p97, %r71, 1;
    ld.shared.f32 %f373, [%r72];
    mul.ftz.f32 %f374, %f36, %f373;
    // begin inline asm
    { cvt.f32.f16 %f369, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f472, %f374, %f369, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f370, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f374, %f370, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f371, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f374, %f371, 0f00000000;
    // begin inline asm
    { cvt.f32.f16 %f372, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f374, %f372, 0f00000000;
    mov.u32 %r374, %r282;
    @%p97 bra $L__BB0_71;

    add.s32 %r287, %r382, 32;
    setp.ne.s32 %p98, %r287, %r9;
    add.s64 %rd80, %rd24, %rd75;
    ld.global.v4.u16 {%rs58, %rs59, %rs60, %rs61}, [%rd80];
    @%p98 bra $L__BB0_66;

    rem.s32 %r288, %r5, %r13;
    setp.ne.s32 %p99, %r288, 0;
    @%p99 bra $L__BB0_66;

    add.s64 %rd82, %rd23, %rd75;
    st.global.v4.u16 [%rd82], {%rs58, %rs59, %rs60, %rs61};

$L__BB0_66:
    setp.eq.s32 %p100, %r71, 2;
    ld.shared.f32 %f379, [%r72+128];
    mul.ftz.f32 %f380, %f36, %f379;
    // begin inline asm
    { cvt.f32.f16 %f375, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f472, %f380, %f375, %f472;
    // begin inline asm
    { cvt.f32.f16 %f376, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f380, %f376, %f471;
    // begin inline asm
    { cvt.f32.f16 %f377, %rs60;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f380, %f377, %f470;
    // begin inline asm
    { cvt.f32.f16 %f378, %rs61;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f380, %f378, %f469;
    mov.u32 %r374, %r284;
    @%p100 bra $L__BB0_71;

    add.s32 %r289, %r382, 64;
    setp.ne.s32 %p101, %r289, %r9;
    add.s64 %rd84, %rd26, %rd75;
    ld.global.v4.u16 {%rs66, %rs67, %rs68, %rs69}, [%rd84];
    @%p101 bra $L__BB0_70;

    rem.s32 %r290, %r5, %r13;
    setp.ne.s32 %p102, %r290, 0;
    @%p102 bra $L__BB0_70;

    add.s64 %rd86, %rd25, %rd75;
    st.global.v4.u16 [%rd86], {%rs66, %rs67, %rs68, %rs69};

$L__BB0_70:
    ld.shared.f32 %f385, [%r72+256];
    mul.ftz.f32 %f386, %f36, %f385;
    // begin inline asm
    { cvt.f32.f16 %f381, %rs66;}
    // end inline asm
    fma.rn.ftz.f32 %f472, %f386, %f381, %f472;
    // begin inline asm
    { cvt.f32.f16 %f382, %rs67;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f386, %f382, %f471;
    // begin inline asm
    { cvt.f32.f16 %f383, %rs68;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f386, %f383, %f470;
    // begin inline asm
    { cvt.f32.f16 %f384, %rs69;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f386, %f384, %f469;
    add.s32 %r374, %r382, 96;

$L__BB0_71:
    setp.lt.u32 %p103, %r70, 96;
    @%p103 bra $L__BB0_86;

    shl.b32 %r291, %r374, 2;
    mul.lo.s32 %r379, %r4, %r374;
    sub.s32 %r378, %r374, %r9;
    add.s32 %r292, %r374, 96;
    mul.lo.s32 %r377, %r4, %r292;
    add.s32 %r293, %r374, 32;
    mul.lo.s32 %r376, %r4, %r293;
    add.s32 %r294, %r374, 64;
    mul.lo.s32 %r375, %r4, %r294;
    add.s32 %r381, %r74, %r291;
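    //
    // [editorial annotation, not compiler output] Main 4x-unrolled value
    // loop: four timesteps per iteration, each contributing
    // prob * (1/sum) * V[t] into the running half4 accumulator
    // {%f472, %f471, %f470, %f469}, with the same per-timestep cache
    // write-through test as the prologue.
    //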
$L__BB0_73:
    mov.u32 %r92, %r381;
    cvt.s64.s32 %rd87, %r379;
    add.s64 %rd88, %rd87, %rd20;
    shl.b64 %rd89, %rd88, 1;
    add.s64 %rd28, %rd7, %rd89;
    setp.ne.s32 %p104, %r378, 0;
    setp.eq.s32 %p105, %r378, 0;
    selp.b64 %rd90, %rd19, %rd28, %p105;
    shl.b64 %rd91, %rd27, 3;
    add.s64 %rd92, %rd90, %rd91;
    ld.global.v4.u16 {%rs74, %rs75, %rs76, %rs77}, [%rd92];
    @%p104 bra $L__BB0_76;

    rem.s32 %r295, %r5, %r13;
    setp.ne.s32 %p106, %r295, 0;
    @%p106 bra $L__BB0_76;

    add.s64 %rd94, %rd28, %rd91;
    st.global.v4.u16 [%rd94], {%rs74, %rs75, %rs76, %rs77};

$L__BB0_76:
    ld.shared.f32 %f391, [%r92];
    mul.ftz.f32 %f392, %f36, %f391;
    // begin inline asm
    { cvt.f32.f16 %f387, %rs74;}
    // end inline asm
    fma.rn.ftz.f32 %f61, %f392, %f387, %f472;
    // begin inline asm
    { cvt.f32.f16 %f388, %rs75;}
    // end inline asm
    fma.rn.ftz.f32 %f62, %f392, %f388, %f471;
    // begin inline asm
    { cvt.f32.f16 %f389, %rs76;}
    // end inline asm
    fma.rn.ftz.f32 %f63, %f392, %f389, %f470;
    // begin inline asm
    { cvt.f32.f16 %f390, %rs77;}
    // end inline asm
    fma.rn.ftz.f32 %f64, %f392, %f390, %f469;
    cvt.s64.s32 %rd95, %r376;
    add.s64 %rd96, %rd95, %rd20;
    shl.b64 %rd97, %rd96, 1;
    add.s64 %rd29, %rd7, %rd97;
    add.s32 %r296, %r374, 32;
    setp.ne.s32 %p107, %r296, %r9;
    setp.eq.s32 %p108, %r296, %r9;
    selp.b64 %rd98, %rd19, %rd29, %p108;
    add.s64 %rd100, %rd98, %rd91;
    ld.global.v4.u16 {%rs82, %rs83, %rs84, %rs85}, [%rd100];
    @%p107 bra $L__BB0_79;

    rem.s32 %r297, %r5, %r13;
    setp.ne.s32 %p109, %r297, 0;
    @%p109 bra $L__BB0_79;

    add.s64 %rd102, %rd29, %rd91;
    st.global.v4.u16 [%rd102], {%rs82, %rs83, %rs84, %rs85};

$L__BB0_79:
    ld.shared.f32 %f397, [%r92+128];
    mul.ftz.f32 %f398, %f36, %f397;
    // begin inline asm
    { cvt.f32.f16 %f393, %rs82;}
    // end inline asm
    fma.rn.ftz.f32 %f65, %f398, %f393, %f61;
    // begin inline asm
    { cvt.f32.f16 %f394, %rs83;}
    // end inline asm
    fma.rn.ftz.f32 %f66, %f398, %f394, %f62;
    // begin inline asm
    { cvt.f32.f16 %f395, %rs84;}
    // end inline asm
    fma.rn.ftz.f32 %f67, %f398, %f395, %f63;
    // begin inline asm
    { cvt.f32.f16 %f396, %rs85;}
    // end inline asm
    fma.rn.ftz.f32 %f68, %f398, %f396, %f64;
    cvt.s64.s32 %rd103, %r375;
    add.s64 %rd104, %rd103, %rd20;
    shl.b64 %rd105, %rd104, 1;
    add.s64 %rd30, %rd7, %rd105;
    add.s32 %r298, %r374, 64;
    setp.ne.s32 %p110, %r298, %r9;
    setp.eq.s32 %p111, %r298, %r9;
    selp.b64 %rd106, %rd19, %rd30, %p111;
    add.s64 %rd108, %rd106, %rd91;
    ld.global.v4.u16 {%rs90, %rs91, %rs92, %rs93}, [%rd108];
    @%p110 bra $L__BB0_82;

    rem.s32 %r299, %r5, %r13;
    setp.ne.s32 %p112, %r299, 0;
    @%p112 bra $L__BB0_82;

    add.s64 %rd110, %rd30, %rd91;
    st.global.v4.u16 [%rd110], {%rs90, %rs91, %rs92, %rs93};

$L__BB0_82:
    ld.shared.f32 %f403, [%r92+256];
    mul.ftz.f32 %f404, %f36, %f403;
    // begin inline asm
    { cvt.f32.f16 %f399, %rs90;}
    // end inline asm
    fma.rn.ftz.f32 %f69, %f404, %f399, %f65;
    // begin inline asm
    { cvt.f32.f16 %f400, %rs91;}
    // end inline asm
    fma.rn.ftz.f32 %f70, %f404, %f400, %f66;
    // begin inline asm
    { cvt.f32.f16 %f401, %rs92;}
    // end inline asm
    fma.rn.ftz.f32 %f71, %f404, %f401, %f67;
    // begin inline asm
    { cvt.f32.f16 %f402, %rs93;}
    // end inline asm
    fma.rn.ftz.f32 %f72, %f404, %f402, %f68;
    cvt.s64.s32 %rd111, %r377;
    add.s64 %rd112, %rd111, %rd20;
    shl.b64 %rd113, %rd112, 1;
    add.s64 %rd31, %rd7, %rd113;
    add.s32 %r300, %r374, 96;
    setp.ne.s32 %p113, %r300, %r9;
    setp.eq.s32 %p114, %r300, %r9;
    selp.b64 %rd114, %rd19, %rd31, %p114;
    add.s64 %rd116, %rd114, %rd91;
    ld.global.v4.u16 {%rs98, %rs99, %rs100, %rs101}, [%rd116];
    @%p113 bra $L__BB0_85;

    rem.s32 %r301, %r5, %r13;
    setp.ne.s32 %p115, %r301, 0;
    @%p115 bra $L__BB0_85;

    add.s64 %rd118, %rd31, %rd91;
    st.global.v4.u16 [%rd118], {%rs98, %rs99, %rs100, %rs101};

$L__BB0_85:
    add.s32 %r381, %r92, 512;
    ld.shared.f32 %f409, [%r92+384];
    mul.ftz.f32 %f410, %f36, %f409;
    // begin inline asm
    { cvt.f32.f16 %f405, %rs98;}
    // end inline asm
    fma.rn.ftz.f32 %f472, %f410, %f405, %f69;
    // begin inline asm
    { cvt.f32.f16 %f406, %rs99;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f410, %f406, %f70;
    // begin inline asm
    { cvt.f32.f16 %f407, %rs100;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f410, %f407, %f71;
    // begin inline asm
    { cvt.f32.f16 %f408, %rs101;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f410, %f408, %f72;
    add.s32 %r379, %r379, %r73;
    add.s32 %r378, %r378, 128;
    add.s32 %r377, %r377, %r73;
    add.s32 %r376, %r376, %r73;
    add.s32 %r375, %r375, %r73;
    add.s32 %r374, %r374, 128;
    setp.lt.s32 %p116, %r374, %r10;
    @%p116 bra $L__BB0_73;

$L__BB0_86:
    shl.b32 %r302, %r373, 4;
    add.s32 %r303, %r69, %r302;
    st.shared.v4.f32 [%r303], {%f472, %f471, %f470, %f469};
    add.s32 %r373, %r373, %r27;
    setp.lt.s32 %p117, %r373, %r68;
    @%p117 bra $L__BB0_57;
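    //
    // [editorial annotation, not compiler output] Each warp has parked its
    // partial output half4 (as v4.f32) in shared memory; the warps now
    // tree-reduce those partials with shfl.sync.down and lane 0 stores the
    // reduced vector back at smem+256.
    //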
$L__BB0_87:
    bar.sync 0;
    setp.ge.s32 %p118, %r382, %r68;
    @%p118 bra $L__BB0_95;

    add.s32 %r304, %r4, 4;
    mul.lo.s32 %r305, %r29, %r304;
    shl.b32 %r306, %r305, 2;
    add.s32 %r308, %r228, %r306;
    add.s32 %r101, %r308, 256;

$L__BB0_89:
    shl.b32 %r309, %r382, 4;
    add.s32 %r310, %r101, %r309;
    ld.shared.v4.f32 {%f477, %f478, %f479, %f480}, [%r310];
    bar.warp.sync -1;
    setp.lt.s32 %p119, %r27, 2;
    @%p119 bra $L__BB0_92;

    mov.u32 %r383, %r27;

$L__BB0_91:
    mov.b32 %r311, %f477;
    shr.u32 %r312, %r383, 31;
    add.s32 %r313, %r383, %r312;
    shr.s32 %r104, %r313, 1;
    mov.u32 %r314, 31;
    mov.u32 %r315, -1;
    shfl.sync.down.b32 %r316|%p120, %r311, %r104, %r314, %r315;
    mov.b32 %f415, %r316;
    add.ftz.f32 %f477, %f477, %f415;
    mov.b32 %r317, %f478;
    shfl.sync.down.b32 %r318|%p121, %r317, %r104, %r314, %r315;
    mov.b32 %f416, %r318;
    add.ftz.f32 %f478, %f478, %f416;
    mov.b32 %r319, %f479;
    shfl.sync.down.b32 %r320|%p122, %r319, %r104, %r314, %r315;
    mov.b32 %f417, %r320;
    add.ftz.f32 %f479, %f479, %f417;
    mov.b32 %r321, %f480;
    shfl.sync.down.b32 %r322|%p123, %r321, %r104, %r314, %r315;
    mov.b32 %f418, %r322;
    add.ftz.f32 %f480, %f480, %f418;
    setp.gt.s32 %p124, %r383, 3;
    mov.u32 %r383, %r104;
    @%p124 bra $L__BB0_91;

$L__BB0_92:
    setp.ne.s32 %p125, %r29, 0;
    @%p125 bra $L__BB0_94;

    add.s32 %r325, %r228, %r309;
    st.shared.v4.f32 [%r325+256], {%f477, %f478, %f479, %f480};

$L__BB0_94:
    add.s32 %r382, %r382, 32;
    setp.lt.s32 %p126, %r382, %r68;
    @%p126 bra $L__BB0_89;
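    //
    // [editorial annotation, not compiler output] Epilogue: pairs of f32
    // results are converted with cvt.rn.f16.f32, packed into u32 half2
    // words, and stored to the output buffer at param+0, using the same
    // nounroll-remainder plus 4x-unrolled structure as the query staging.
    //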
$L__BB0_95:
    mov.u32 %r387, %tid.x;
    setp.ge.s32 %p127, %r387, %r16;
    bar.sync 0;
    @%p127 bra $L__BB0_102;

    not.b32 %r327, %r387;
    add.s32 %r107, %r16, %r327;
    shr.u32 %r328, %r107, 10;
    add.s32 %r329, %r328, 1;
    and.b32 %r386, %r329, 3;
    setp.eq.s32 %p128, %r386, 0;
    @%p128 bra $L__BB0_99;

    ld.param.u64 %rd130, [MultiHeadAttention_sbh_kernel_param_0];
    mov.u32 %r387, %tid.x;
    shl.b32 %r330, %r387, 3;
    add.s32 %r332, %r228, %r330;
    add.s32 %r384, %r332, 256;
    mul.wide.s32 %rd119, %r387, 2;
    cvt.s64.s32 %rd120, %r128;
    add.s64 %rd121, %rd119, %rd120;
    cvta.to.global.u64 %rd122, %rd130;
    shl.b64 %rd123, %rd121, 1;
    add.s64 %rd133, %rd122, %rd123;

$L__BB0_98:
    .pragma "nounroll";
    ld.shared.v2.f32 {%f421, %f422}, [%r384];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f421; cvt.rn.f16.f32 high, %f422; mov.b32 %r337, {low,high};}
    // end inline asm
    st.global.u32 [%rd133], %r337;
    add.s32 %r387, %r387, 1024;
    add.s32 %r384, %r384, 8192;
    add.s64 %rd133, %rd133, 4096;
    add.s32 %r386, %r386, -1;
    setp.ne.s32 %p129, %r386, 0;
    @%p129 bra $L__BB0_98;

$L__BB0_99:
    setp.lt.u32 %p130, %r107, 3072;
    @%p130 bra $L__BB0_102;

    ld.param.u64 %rd131, [MultiHeadAttention_sbh_kernel_param_0];
    mul.wide.s32 %rd124, %r387, 2;
    cvt.s64.s32 %rd125, %r128;
    cvta.to.global.u64 %rd126, %rd131;
    add.s64 %rd127, %rd124, %rd125;
    shl.b64 %rd128, %rd127, 1;
    add.s64 %rd129, %rd126, %rd128;
    add.s64 %rd134, %rd129, 8192;
    shl.b32 %r342, %r387, 3;
    add.s32 %r344, %r228, %r342;
    add.s32 %r388, %r344, 16640;

$L__BB0_101:
    ld.shared.v2.f32 {%f431, %f432}, [%r388+-16384];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f431; cvt.rn.f16.f32 high, %f432; mov.b32 %r345, {low,high};}
    // end inline asm
    st.global.u32 [%rd134+-8192], %r345;
    ld.shared.v2.f32 {%f433, %f434}, [%r388+-8192];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f433; cvt.rn.f16.f32 high, %f434; mov.b32 %r346, {low,high};}
    // end inline asm
    st.global.u32 [%rd134+-4096], %r346;
    ld.shared.v2.f32 {%f435, %f436}, [%r388];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f435; cvt.rn.f16.f32 high, %f436; mov.b32 %r347, {low,high};}
    // end inline asm
    st.global.u32 [%rd134], %r347;
    ld.shared.v2.f32 {%f437, %f438}, [%r388+8192];
    // begin inline asm
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f437; cvt.rn.f16.f32 high, %f438; mov.b32 %r348, {low,high};}
    // end inline asm
    st.global.u32 [%rd134+4096], %r348;
    add.s64 %rd134, %rd134, 16384;
    add.s32 %r388, %r388, 32768;
    add.s32 %r387, %r387, 4096;
    setp.lt.s32 %p131, %r387, %r16;
    @%p131 bra $L__BB0_101;

$L__BB0_102:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
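// ---------------------------------------------------------------------------
// [editorial sketch, not part of the compiler output] A minimal CUDA-level
// reconstruction of the algorithm the PTX above appears to implement: one
// decode step of masked multi-head attention over a K/V cache, with the
// softmax computed as exp2((x - max) * log2 e) and normalized by an
// approximate reciprocal of the sum. Every identifier, layout, and parameter
// below is an assumption made for illustration -- none of it is taken from
// MultiHeadAttention_sbh_kernel_param_0's actual layout -- and the real
// kernel additionally handles GQA head mapping, bias/mask addition, K/V
// cache write-through, vectorized half4/half2 I/O, and multiple warps per
// block, all of which are omitted here. Hypothetical launch:
//   mha_decode_sketch<<<BH, 32, S * sizeof(float)>>>(q, k, v, out, S, BH, D, scale);
// ---------------------------------------------------------------------------
#include <cuda_fp16.h>
#include <cfloat>

__global__ void mha_decode_sketch(const half* __restrict__ q,   // [BH, D]
                                  const half* __restrict__ k,   // [S, BH, D]
                                  const half* __restrict__ v,   // [S, BH, D]
                                  half* __restrict__ out,       // [BH, D]
                                  int S, int BH, int D, float scale)
{
    // One warp per (batch, head) pair; dynamic shared memory holds S logits.
    extern __shared__ float logits[];
    const int bh = blockIdx.x, lane = threadIdx.x;

    // 1) logits[t] = scale * dot(q, K[t]); lanes cooperate on each dot product.
    for (int t = 0; t < S; ++t) {
        float acc = 0.f;
        for (int i = lane; i < D; i += 32)
            acc += __half2float(q[bh * D + i]) *
                   __half2float(k[((size_t)t * BH + bh) * D + i]);
        for (int o = 16; o > 0; o >>= 1)          // warp tree-reduction
            acc += __shfl_down_sync(0xffffffffu, acc, o);
        if (lane == 0) logits[t] = acc * scale;
    }
    __syncwarp();

    // 2) softmax: warp max, then exp2((x - max) * log2 e) as in the PTX,
    //    then the warp sum and its reciprocal.
    float m = -FLT_MAX;                           // identity, cf. 0fFF7FFFFF
    for (int t = lane; t < S; t += 32) m = fmaxf(m, logits[t]);
    for (int o = 16; o > 0; o >>= 1)
        m = fmaxf(m, __shfl_down_sync(0xffffffffu, m, o));
    m = __shfl_sync(0xffffffffu, m, 0);           // broadcast the max

    float sum = 0.f;
    for (int t = lane; t < S; t += 32) {
        float p = exp2f((logits[t] - m) * 1.4426950408889634f); // log2(e)
        logits[t] = p;                            // overwrite logit with prob
        sum += p;
    }
    for (int o = 16; o > 0; o >>= 1)
        sum += __shfl_down_sync(0xffffffffu, sum, o);
    const float inv_sum = 1.f / __shfl_sync(0xffffffffu, sum, 0);
    __syncwarp();

    // 3) out = sum_t prob[t] * inv_sum * V[t]; each lane owns channels
    //    i, i+32, ... and converts its f32 result back to fp16.
    for (int i = lane; i < D; i += 32) {
        float acc = 0.f;
        for (int t = 0; t < S; ++t)
            acc += logits[t] * inv_sum *
                   __half2float(v[((size_t)t * BH + bh) * D + i]);
        out[bh * D + i] = __float2half_rn(acc);
    }
}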