obal .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders3_10E[1]; .visible .entry _Z27dequant_gemv_group32_batch723DequantGemvKernelParams( .param .align 8 .b8 _Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<30>; .reg .b16 %rs<265>; .reg .f32 %f<541>; .reg .b32 %r<258>; .reg .b64 %rd<90>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[4480]; ld.param.v2.u32 {%r50, %r51}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r52, %r53}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f84, %f85}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs73, %rs74, %rs75, %rs76}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd26, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd25, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd24, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd23, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+16]; ld.param.u64 %rd22, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r257, %tid.y; shl.b32 %r54, %r257, 5; mov.u32 %r55, %tid.x; add.s32 %r256, %r54, %r55; setp.ge.u32 %p1, %r256, %r52; mov.f32 %f513, 
0f00000000; mov.f32 %f514, %f513; mov.f32 %f515, %f513; mov.f32 %f516, %f513; mov.f32 %f517, %f513; mov.f32 %f518, %f513; mov.f32 %f519, %f513; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd3, %rd23; mov.u32 %r56, %ctaid.x; mul.lo.s32 %r5, %r53, %r56; $L__BB0_2: mad.lo.s32 %r59, %r52, %r56, %r256; mul.wide.u32 %rd33, %r59, 4; add.s64 %rd28, %rd24, %rd33; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd27, 1.0; // end inline asm // begin inline asm ld.global.L1::no_allocate.L2::cache_hint.u32 %r57, [%rd28], %rd27; // end inline asm shr.u32 %r61, %r55, 2; shl.b32 %r62, %r257, 3; add.s32 %r9, %r62, %r61; add.s32 %r10, %r9, %r5; mul.wide.s32 %rd34, %r10, 2; add.s64 %rd31, %rd26, %rd34; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd30, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u16 %rs81, [%rd31], %rd30; // end inline asm // begin inline asm { cvt.f32.f16 %f93, %rs81;} // end inline asm shl.b16 %rs264, %rs73, 3; setp.eq.s64 %p2, %rd25, 0; @%p2 bra $L__BB0_4; shr.u32 %r63, %r10, 31; add.s32 %r64, %r10, %r63; shr.s32 %r65, %r64, 1; cvt.s64.s32 %rd38, %r65; add.s64 %rd36, %rd25, %rd38; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd35, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs83, [%rd36], %rd35; // end inline asm cvt.u32.u16 %r66, %rs83; and.b32 %r67, %r66, 255; shl.b32 %r68, %r9, 2; and.b32 %r69, %r68, 4; shr.u32 %r70, %r67, %r69; cvt.u16.u32 %rs84, %r70; and.b16 %rs264, %rs84, 15; $L__BB0_4: shl.b32 %r11, %r256, 3; setp.ge.s32 %p3, %r11, %r50; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs73, 0; mul.wide.s32 %rd39, %r11, 2; add.s64 %rd40, %rd3, %rd39; ld.global.v4.u32 {%r71, %r72, %r73, %r74}, [%rd40]; mul.wide.s32 %rd41, %r50, 2; add.s64 %rd42, %rd40, %rd41; ld.global.v4.u32 {%r75, %r76, %r77, %r78}, [%rd42]; add.s32 %r79, %r50, %r11; add.s32 %r80, %r79, %r50; mul.wide.s32 %rd43, %r80, 
2; add.s64 %rd44, %rd3, %rd43; ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd44]; add.s64 %rd45, %rd44, %rd41; ld.global.v4.u32 {%r85, %r86, %r87, %r88}, [%rd45]; add.s64 %rd46, %rd45, %rd41; ld.global.v4.u32 {%r89, %r90, %r91, %r92}, [%rd46]; add.s64 %rd47, %rd46, %rd41; ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd47]; add.s64 %rd48, %rd47, %rd41; ld.global.v4.u32 {%r97, %r98, %r99, %r100}, [%rd48]; shr.u16 %rs86, %rs264, 3; and.b16 %rs87, %rs86, 1; setp.eq.b16 %p5, %rs87, 1; and.pred %p6, %p4, %p5; selp.b16 %rs88, -16, 0, %p6; or.b16 %rs89, %rs88, %rs264; cvt.s16.s8 %rs90, %rs89; cvt.rn.f32.s16 %f9, %rs90; cvt.u16.u32 %rs5, %r57; and.b16 %rs6, %rs5, 15; mov.b32 {%rs7, %rs19}, %r71; mov.b32 {%rs9, %rs20}, %r75; mov.b32 {%rs11, %rs21}, %r81; mov.b32 {%rs13, %rs22}, %r85; mov.b32 {%rs14, %rs23}, %r89; mov.b32 {%rs15, %rs24}, %r93; mov.b32 {%rs16, %rs25}, %r97; shr.u32 %r101, %r57, 4; cvt.u16.u32 %rs17, %r101; and.b16 %rs18, %rs17, 15; shr.u32 %r102, %r57, 8; cvt.u16.u32 %rs26, %r102; and.b16 %rs27, %rs26, 15; mov.b32 {%rs28, %rs37}, %r72; mov.b32 {%rs29, %rs38}, %r76; mov.b32 {%rs30, %rs39}, %r82; mov.b32 {%rs31, %rs40}, %r86; mov.b32 {%rs32, %rs41}, %r90; mov.b32 {%rs33, %rs42}, %r94; mov.b32 {%rs34, %rs43}, %r98; shr.u32 %r103, %r57, 12; cvt.u16.u32 %rs35, %r103; and.b16 %rs36, %rs35, 15; shr.u32 %r104, %r57, 16; cvt.u16.u32 %rs44, %r104; and.b16 %rs45, %rs44, 15; mov.b32 {%rs46, %rs55}, %r73; mov.b32 {%rs47, %rs56}, %r77; mov.b32 {%rs48, %rs57}, %r83; mov.b32 {%rs49, %rs58}, %r87; mov.b32 {%rs50, %rs59}, %r91; mov.b32 {%rs51, %rs60}, %r95; mov.b32 {%rs52, %rs61}, %r99; shr.u32 %r105, %r57, 20; cvt.u16.u32 %rs53, %r105; and.b16 %rs54, %rs53, 15; shr.u32 %r106, %r57, 24; cvt.u16.u32 %rs62, %r106; and.b16 %rs63, %rs62, 15; shr.u32 %r107, %r57, 28; cvt.u16.u32 %rs64, %r107; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f150, %rs6; sub.ftz.f32 %f151, %f150, %f9; mul.ftz.f32 %f152, %f93, %f151; // begin inline asm { cvt.f32.f16 %f94, %rs7;} // end inline asm fma.rn.ftz.f32 
%f153, %f152, %f94, %f513; // begin inline asm { cvt.f32.f16 %f95, %rs9;} // end inline asm fma.rn.ftz.f32 %f154, %f152, %f95, %f514; // begin inline asm { cvt.f32.f16 %f96, %rs11;} // end inline asm fma.rn.ftz.f32 %f155, %f152, %f96, %f515; // begin inline asm { cvt.f32.f16 %f97, %rs13;} // end inline asm fma.rn.ftz.f32 %f156, %f152, %f97, %f516; // begin inline asm { cvt.f32.f16 %f98, %rs14;} // end inline asm fma.rn.ftz.f32 %f157, %f152, %f98, %f517; // begin inline asm { cvt.f32.f16 %f99, %rs15;} // end inline asm fma.rn.ftz.f32 %f158, %f152, %f99, %f518; // begin inline asm { cvt.f32.f16 %f100, %rs16;} // end inline asm fma.rn.ftz.f32 %f159, %f152, %f100, %f519; cvt.rn.f32.s16 %f160, %rs18; sub.ftz.f32 %f161, %f160, %f9; mul.ftz.f32 %f162, %f93, %f161; // begin inline asm { cvt.f32.f16 %f101, %rs19;} // end inline asm fma.rn.ftz.f32 %f163, %f162, %f101, %f153; // begin inline asm { cvt.f32.f16 %f102, %rs20;} // end inline asm fma.rn.ftz.f32 %f164, %f162, %f102, %f154; // begin inline asm { cvt.f32.f16 %f103, %rs21;} // end inline asm fma.rn.ftz.f32 %f165, %f162, %f103, %f155; // begin inline asm { cvt.f32.f16 %f104, %rs22;} // end inline asm fma.rn.ftz.f32 %f166, %f162, %f104, %f156; // begin inline asm { cvt.f32.f16 %f105, %rs23;} // end inline asm fma.rn.ftz.f32 %f167, %f162, %f105, %f157; // begin inline asm { cvt.f32.f16 %f106, %rs24;} // end inline asm fma.rn.ftz.f32 %f168, %f162, %f106, %f158; // begin inline asm { cvt.f32.f16 %f107, %rs25;} // end inline asm fma.rn.ftz.f32 %f169, %f162, %f107, %f159; cvt.rn.f32.s16 %f170, %rs27; sub.ftz.f32 %f171, %f170, %f9; mul.ftz.f32 %f172, %f93, %f171; // begin inline asm { cvt.f32.f16 %f108, %rs28;} // end inline asm fma.rn.ftz.f32 %f173, %f172, %f108, %f163; // begin inline asm { cvt.f32.f16 %f109, %rs29;} // end inline asm fma.rn.ftz.f32 %f174, %f172, %f109, %f164; // begin inline asm { cvt.f32.f16 %f110, %rs30;} // end inline asm fma.rn.ftz.f32 %f175, %f172, %f110, %f165; // begin inline asm { cvt.f32.f16 
%f111, %rs31;} // end inline asm fma.rn.ftz.f32 %f176, %f172, %f111, %f166; // begin inline asm { cvt.f32.f16 %f112, %rs32;} // end inline asm fma.rn.ftz.f32 %f177, %f172, %f112, %f167; // begin inline asm { cvt.f32.f16 %f113, %rs33;} // end inline asm fma.rn.ftz.f32 %f178, %f172, %f113, %f168; // begin inline asm { cvt.f32.f16 %f114, %rs34;} // end inline asm fma.rn.ftz.f32 %f179, %f172, %f114, %f169; cvt.rn.f32.s16 %f180, %rs36; sub.ftz.f32 %f181, %f180, %f9; mul.ftz.f32 %f182, %f93, %f181; // begin inline asm { cvt.f32.f16 %f115, %rs37;} // end inline asm fma.rn.ftz.f32 %f183, %f182, %f115, %f173; // begin inline asm { cvt.f32.f16 %f116, %rs38;} // end inline asm fma.rn.ftz.f32 %f184, %f182, %f116, %f174; // begin inline asm { cvt.f32.f16 %f117, %rs39;} // end inline asm fma.rn.ftz.f32 %f185, %f182, %f117, %f175; // begin inline asm { cvt.f32.f16 %f118, %rs40;} // end inline asm fma.rn.ftz.f32 %f186, %f182, %f118, %f176; // begin inline asm { cvt.f32.f16 %f119, %rs41;} // end inline asm fma.rn.ftz.f32 %f187, %f182, %f119, %f177; // begin inline asm { cvt.f32.f16 %f120, %rs42;} // end inline asm fma.rn.ftz.f32 %f188, %f182, %f120, %f178; // begin inline asm { cvt.f32.f16 %f121, %rs43;} // end inline asm fma.rn.ftz.f32 %f189, %f182, %f121, %f179; cvt.rn.f32.s16 %f190, %rs45; sub.ftz.f32 %f191, %f190, %f9; mul.ftz.f32 %f192, %f93, %f191; // begin inline asm { cvt.f32.f16 %f122, %rs46;} // end inline asm fma.rn.ftz.f32 %f193, %f192, %f122, %f183; // begin inline asm { cvt.f32.f16 %f123, %rs47;} // end inline asm fma.rn.ftz.f32 %f194, %f192, %f123, %f184; // begin inline asm { cvt.f32.f16 %f124, %rs48;} // end inline asm fma.rn.ftz.f32 %f195, %f192, %f124, %f185; // begin inline asm { cvt.f32.f16 %f125, %rs49;} // end inline asm fma.rn.ftz.f32 %f196, %f192, %f125, %f186; // begin inline asm { cvt.f32.f16 %f126, %rs50;} // end inline asm fma.rn.ftz.f32 %f197, %f192, %f126, %f187; // begin inline asm { cvt.f32.f16 %f127, %rs51;} // end inline asm fma.rn.ftz.f32 %f198, 
%f192, %f127, %f188; // begin inline asm { cvt.f32.f16 %f128, %rs52;} // end inline asm fma.rn.ftz.f32 %f199, %f192, %f128, %f189; cvt.rn.f32.s16 %f200, %rs54; sub.ftz.f32 %f201, %f200, %f9; mul.ftz.f32 %f202, %f93, %f201; // begin inline asm { cvt.f32.f16 %f129, %rs55;} // end inline asm fma.rn.ftz.f32 %f203, %f202, %f129, %f193; // begin inline asm { cvt.f32.f16 %f130, %rs56;} // end inline asm fma.rn.ftz.f32 %f204, %f202, %f130, %f194; // begin inline asm { cvt.f32.f16 %f131, %rs57;} // end inline asm fma.rn.ftz.f32 %f205, %f202, %f131, %f195; // begin inline asm { cvt.f32.f16 %f132, %rs58;} // end inline asm fma.rn.ftz.f32 %f206, %f202, %f132, %f196; // begin inline asm { cvt.f32.f16 %f133, %rs59;} // end inline asm fma.rn.ftz.f32 %f207, %f202, %f133, %f197; // begin inline asm { cvt.f32.f16 %f134, %rs60;} // end inline asm fma.rn.ftz.f32 %f208, %f202, %f134, %f198; // begin inline asm { cvt.f32.f16 %f135, %rs61;} // end inline asm fma.rn.ftz.f32 %f209, %f202, %f135, %f199; cvt.rn.f32.s16 %f210, %rs63; sub.ftz.f32 %f211, %f210, %f9; mul.ftz.f32 %f212, %f93, %f211; mov.b32 {%rs133, %rs140}, %r74; // begin inline asm { cvt.f32.f16 %f136, %rs133;} // end inline asm fma.rn.ftz.f32 %f213, %f212, %f136, %f203; mov.b32 {%rs134, %rs141}, %r78; // begin inline asm { cvt.f32.f16 %f137, %rs134;} // end inline asm fma.rn.ftz.f32 %f214, %f212, %f137, %f204; mov.b32 {%rs135, %rs142}, %r84; // begin inline asm { cvt.f32.f16 %f138, %rs135;} // end inline asm fma.rn.ftz.f32 %f215, %f212, %f138, %f205; mov.b32 {%rs136, %rs143}, %r88; // begin inline asm { cvt.f32.f16 %f139, %rs136;} // end inline asm fma.rn.ftz.f32 %f216, %f212, %f139, %f206; mov.b32 {%rs137, %rs144}, %r92; // begin inline asm { cvt.f32.f16 %f140, %rs137;} // end inline asm fma.rn.ftz.f32 %f217, %f212, %f140, %f207; mov.b32 {%rs138, %rs145}, %r96; // begin inline asm { cvt.f32.f16 %f141, %rs138;} // end inline asm fma.rn.ftz.f32 %f218, %f212, %f141, %f208; mov.b32 {%rs139, %rs146}, %r100; // begin inline asm { 
cvt.f32.f16 %f142, %rs139;} // end inline asm fma.rn.ftz.f32 %f219, %f212, %f142, %f209; cvt.rn.f32.s16 %f220, %rs64; sub.ftz.f32 %f221, %f220, %f9; mul.ftz.f32 %f222, %f93, %f221; // begin inline asm { cvt.f32.f16 %f143, %rs140;} // end inline asm fma.rn.ftz.f32 %f513, %f222, %f143, %f213; // begin inline asm { cvt.f32.f16 %f144, %rs141;} // end inline asm fma.rn.ftz.f32 %f514, %f222, %f144, %f214; // begin inline asm { cvt.f32.f16 %f145, %rs142;} // end inline asm fma.rn.ftz.f32 %f515, %f222, %f145, %f215; // begin inline asm { cvt.f32.f16 %f146, %rs143;} // end inline asm fma.rn.ftz.f32 %f516, %f222, %f146, %f216; // begin inline asm { cvt.f32.f16 %f147, %rs144;} // end inline asm fma.rn.ftz.f32 %f517, %f222, %f147, %f217; // begin inline asm { cvt.f32.f16 %f148, %rs145;} // end inline asm fma.rn.ftz.f32 %f518, %f222, %f148, %f218; // begin inline asm { cvt.f32.f16 %f149, %rs146;} // end inline asm fma.rn.ftz.f32 %f519, %f222, %f149, %f219; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs203, %rs5, 4; cvt.s16.s8 %rs204, %rs203; shr.s16 %rs205, %rs204, 7; and.b16 %rs206, %rs205, -16; or.b16 %rs207, %rs206, %rs6; cvt.rn.f32.s16 %f279, %rs207; sub.ftz.f32 %f280, %f279, %f9; mul.ftz.f32 %f281, %f93, %f280; cvt.u16.u32 %rs147, %r71; // begin inline asm { cvt.f32.f16 %f223, %rs147;} // end inline asm fma.rn.ftz.f32 %f282, %f281, %f223, %f513; cvt.u16.u32 %rs148, %r75; // begin inline asm { cvt.f32.f16 %f224, %rs148;} // end inline asm fma.rn.ftz.f32 %f283, %f281, %f224, %f514; cvt.u16.u32 %rs149, %r81; // begin inline asm { cvt.f32.f16 %f225, %rs149;} // end inline asm fma.rn.ftz.f32 %f284, %f281, %f225, %f515; // begin inline asm { cvt.f32.f16 %f226, %rs13;} // end inline asm fma.rn.ftz.f32 %f285, %f281, %f226, %f516; // begin inline asm { cvt.f32.f16 %f227, %rs14;} // end inline asm fma.rn.ftz.f32 %f286, %f281, %f227, %f517; // begin inline asm { cvt.f32.f16 %f228, %rs15;} // end inline asm fma.rn.ftz.f32 %f287, %f281, %f228, %f518; // begin inline asm { cvt.f32.f16 %f229, 
%rs16;} // end inline asm fma.rn.ftz.f32 %f288, %f281, %f229, %f519; shl.b16 %rs208, %rs17, 4; cvt.s16.s8 %rs209, %rs208; shr.s16 %rs210, %rs209, 7; and.b16 %rs211, %rs210, -16; or.b16 %rs212, %rs211, %rs18; cvt.rn.f32.s16 %f289, %rs212; sub.ftz.f32 %f290, %f289, %f9; mul.ftz.f32 %f291, %f93, %f290; // begin inline asm { cvt.f32.f16 %f230, %rs19;} // end inline asm fma.rn.ftz.f32 %f292, %f291, %f230, %f282; // begin inline asm { cvt.f32.f16 %f231, %rs20;} // end inline asm fma.rn.ftz.f32 %f293, %f291, %f231, %f283; // begin inline asm { cvt.f32.f16 %f232, %rs21;} // end inline asm fma.rn.ftz.f32 %f294, %f291, %f232, %f284; // begin inline asm { cvt.f32.f16 %f233, %rs22;} // end inline asm fma.rn.ftz.f32 %f295, %f291, %f233, %f285; // begin inline asm { cvt.f32.f16 %f234, %rs23;} // end inline asm fma.rn.ftz.f32 %f296, %f291, %f234, %f286; // begin inline asm { cvt.f32.f16 %f235, %rs24;} // end inline asm fma.rn.ftz.f32 %f297, %f291, %f235, %f287; // begin inline asm { cvt.f32.f16 %f236, %rs25;} // end inline asm fma.rn.ftz.f32 %f298, %f291, %f236, %f288; shl.b16 %rs213, %rs26, 4; cvt.s16.s8 %rs214, %rs213; shr.s16 %rs215, %rs214, 7; and.b16 %rs216, %rs215, -16; or.b16 %rs217, %rs216, %rs27; cvt.rn.f32.s16 %f299, %rs217; sub.ftz.f32 %f300, %f299, %f9; mul.ftz.f32 %f301, %f93, %f300; // begin inline asm { cvt.f32.f16 %f237, %rs28;} // end inline asm fma.rn.ftz.f32 %f302, %f301, %f237, %f292; // begin inline asm { cvt.f32.f16 %f238, %rs29;} // end inline asm fma.rn.ftz.f32 %f303, %f301, %f238, %f293; // begin inline asm { cvt.f32.f16 %f239, %rs30;} // end inline asm fma.rn.ftz.f32 %f304, %f301, %f239, %f294; // begin inline asm { cvt.f32.f16 %f240, %rs31;} // end inline asm fma.rn.ftz.f32 %f305, %f301, %f240, %f295; // begin inline asm { cvt.f32.f16 %f241, %rs32;} // end inline asm fma.rn.ftz.f32 %f306, %f301, %f241, %f296; // begin inline asm { cvt.f32.f16 %f242, %rs33;} // end inline asm fma.rn.ftz.f32 %f307, %f301, %f242, %f297; // begin inline asm { cvt.f32.f16 
%f243, %rs34;} // end inline asm fma.rn.ftz.f32 %f308, %f301, %f243, %f298; shl.b16 %rs218, %rs35, 4; cvt.s16.s8 %rs219, %rs218; shr.s16 %rs220, %rs219, 7; and.b16 %rs221, %rs220, -16; or.b16 %rs222, %rs221, %rs36; cvt.rn.f32.s16 %f309, %rs222; sub.ftz.f32 %f310, %f309, %f9; mul.ftz.f32 %f311, %f93, %f310; // begin inline asm { cvt.f32.f16 %f244, %rs37;} // end inline asm fma.rn.ftz.f32 %f312, %f311, %f244, %f302; // begin inline asm { cvt.f32.f16 %f245, %rs38;} // end inline asm fma.rn.ftz.f32 %f313, %f311, %f245, %f303; // begin inline asm { cvt.f32.f16 %f246, %rs39;} // end inline asm fma.rn.ftz.f32 %f314, %f311, %f246, %f304; // begin inline asm { cvt.f32.f16 %f247, %rs40;} // end inline asm fma.rn.ftz.f32 %f315, %f311, %f247, %f305; // begin inline asm { cvt.f32.f16 %f248, %rs41;} // end inline asm fma.rn.ftz.f32 %f316, %f311, %f248, %f306; // begin inline asm { cvt.f32.f16 %f249, %rs42;} // end inline asm fma.rn.ftz.f32 %f317, %f311, %f249, %f307; // begin inline asm { cvt.f32.f16 %f250, %rs43;} // end inline asm fma.rn.ftz.f32 %f318, %f311, %f250, %f308; shl.b16 %rs223, %rs44, 4; cvt.s16.s8 %rs224, %rs223; shr.s16 %rs225, %rs224, 7; and.b16 %rs226, %rs225, -16; or.b16 %rs227, %rs226, %rs45; cvt.rn.f32.s16 %f319, %rs227; sub.ftz.f32 %f320, %f319, %f9; mul.ftz.f32 %f321, %f93, %f320; // begin inline asm { cvt.f32.f16 %f251, %rs46;} // end inline asm fma.rn.ftz.f32 %f322, %f321, %f251, %f312; // begin inline asm { cvt.f32.f16 %f252, %rs47;} // end inline asm fma.rn.ftz.f32 %f323, %f321, %f252, %f313; // begin inline asm { cvt.f32.f16 %f253, %rs48;} // end inline asm fma.rn.ftz.f32 %f324, %f321, %f253, %f314; // begin inline asm { cvt.f32.f16 %f254, %rs49;} // end inline asm fma.rn.ftz.f32 %f325, %f321, %f254, %f315; // begin inline asm { cvt.f32.f16 %f255, %rs50;} // end inline asm fma.rn.ftz.f32 %f326, %f321, %f255, %f316; // begin inline asm { cvt.f32.f16 %f256, %rs51;} // end inline asm fma.rn.ftz.f32 %f327, %f321, %f256, %f317; // begin inline asm { 
cvt.f32.f16 %f257, %rs52;} // end inline asm fma.rn.ftz.f32 %f328, %f321, %f257, %f318; shl.b16 %rs228, %rs53, 4; cvt.s16.s8 %rs229, %rs228; shr.s16 %rs230, %rs229, 7; and.b16 %rs231, %rs230, -16; or.b16 %rs232, %rs231, %rs54; cvt.rn.f32.s16 %f329, %rs232; sub.ftz.f32 %f330, %f329, %f9; mul.ftz.f32 %f331, %f93, %f330; // begin inline asm { cvt.f32.f16 %f258, %rs55;} // end inline asm fma.rn.ftz.f32 %f332, %f331, %f258, %f322; // begin inline asm { cvt.f32.f16 %f259, %rs56;} // end inline asm fma.rn.ftz.f32 %f333, %f331, %f259, %f323; // begin inline asm { cvt.f32.f16 %f260, %rs57;} // end inline asm fma.rn.ftz.f32 %f334, %f331, %f260, %f324; // begin inline asm { cvt.f32.f16 %f261, %rs58;} // end inline asm fma.rn.ftz.f32 %f335, %f331, %f261, %f325; // begin inline asm { cvt.f32.f16 %f262, %rs59;} // end inline asm fma.rn.ftz.f32 %f336, %f331, %f262, %f326; // begin inline asm { cvt.f32.f16 %f263, %rs60;} // end inline asm fma.rn.ftz.f32 %f337, %f331, %f263, %f327; // begin inline asm { cvt.f32.f16 %f264, %rs61;} // end inline asm fma.rn.ftz.f32 %f338, %f331, %f264, %f328; shl.b16 %rs233, %rs62, 4; cvt.s16.s8 %rs234, %rs233; shr.s16 %rs235, %rs234, 7; and.b16 %rs236, %rs235, -16; or.b16 %rs237, %rs236, %rs63; cvt.rn.f32.s16 %f339, %rs237; sub.ftz.f32 %f340, %f339, %f9; mul.ftz.f32 %f341, %f93, %f340; mov.b32 {%rs189, %rs196}, %r74; // begin inline asm { cvt.f32.f16 %f265, %rs189;} // end inline asm fma.rn.ftz.f32 %f342, %f341, %f265, %f332; mov.b32 {%rs190, %rs197}, %r78; // begin inline asm { cvt.f32.f16 %f266, %rs190;} // end inline asm fma.rn.ftz.f32 %f343, %f341, %f266, %f333; mov.b32 {%rs191, %rs198}, %r84; // begin inline asm { cvt.f32.f16 %f267, %rs191;} // end inline asm fma.rn.ftz.f32 %f344, %f341, %f267, %f334; mov.b32 {%rs192, %rs199}, %r88; // begin inline asm { cvt.f32.f16 %f268, %rs192;} // end inline asm fma.rn.ftz.f32 %f345, %f341, %f268, %f335; mov.b32 {%rs193, %rs200}, %r92; // begin inline asm { cvt.f32.f16 %f269, %rs193;} // end inline asm 
fma.rn.ftz.f32 %f346, %f341, %f269, %f336; mov.b32 {%rs194, %rs201}, %r96; // begin inline asm { cvt.f32.f16 %f270, %rs194;} // end inline asm fma.rn.ftz.f32 %f347, %f341, %f270, %f337; mov.b32 {%rs195, %rs202}, %r100; // begin inline asm { cvt.f32.f16 %f271, %rs195;} // end inline asm fma.rn.ftz.f32 %f348, %f341, %f271, %f338; shl.b16 %rs238, %rs64, 4; cvt.s16.s8 %rs239, %rs238; shr.s16 %rs240, %rs239, 7; and.b16 %rs241, %rs240, -16; or.b16 %rs242, %rs241, %rs64; cvt.rn.f32.s16 %f349, %rs242; sub.ftz.f32 %f350, %f349, %f9; mul.ftz.f32 %f351, %f93, %f350; // begin inline asm { cvt.f32.f16 %f272, %rs196;} // end inline asm fma.rn.ftz.f32 %f513, %f351, %f272, %f342; // begin inline asm { cvt.f32.f16 %f273, %rs197;} // end inline asm fma.rn.ftz.f32 %f514, %f351, %f273, %f343; // begin inline asm { cvt.f32.f16 %f274, %rs198;} // end inline asm fma.rn.ftz.f32 %f515, %f351, %f274, %f344; // begin inline asm { cvt.f32.f16 %f275, %rs199;} // end inline asm fma.rn.ftz.f32 %f516, %f351, %f275, %f345; // begin inline asm { cvt.f32.f16 %f276, %rs200;} // end inline asm fma.rn.ftz.f32 %f517, %f351, %f276, %f346; // begin inline asm { cvt.f32.f16 %f277, %rs201;} // end inline asm fma.rn.ftz.f32 %f518, %f351, %f277, %f347; // begin inline asm { cvt.f32.f16 %f278, %rs202;} // end inline asm fma.rn.ftz.f32 %f519, %f351, %f278, %f348; $L__BB0_8: add.s32 %r257, %r257, 4; shl.b32 %r108, %r257, 5; add.s32 %r256, %r108, %r55; setp.lt.u32 %p7, %r256, %r52; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r251, %tid.y; mov.u32 %r250, %tid.x; shl.b32 %r249, %r251, 5; add.s32 %r248, %r249, %r250; shl.b32 %r110, %r248, 2; mov.u32 %r111, _ZZ9gemv_int4ILi4ELi32ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r112, %r111, %r110; setp.lt.u32 %p8, %r248, 32; @%p8 bra $L__BB0_11; add.s32 %r241, %r112, -112; st.shared.f32 [%r241], %f513; $L__BB0_11: mov.u32 %r255, %tid.y; mov.u32 %r254, %tid.x; shl.b32 %r253, %r255, 5; add.s32 %r252, %r253, %r254; setp.gt.u32 %p9, %r252, 31; bar.sync 0; 
mad.lo.s32 %r43, %r252, 12, %r111; @%p9 bra $L__BB0_13; mov.u32 %r127, 16; ld.shared.f32 %f367, [%r43+16]; add.ftz.f32 %f368, %f513, %f367; ld.shared.f32 %f369, [%r43+20]; add.ftz.f32 %f370, %f368, %f369; ld.shared.f32 %f371, [%r43+24]; add.ftz.f32 %f354, %f370, %f371; mov.u32 %r115, 1; mov.u32 %r128, 31; mov.u32 %r129, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f354, %r115, %r128, %r129; @p add.f32 r0, r0, %f354; mov.f32 %f352, r0;} // end inline asm mov.u32 %r118, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f352, %r118, %r128, %r129; @p add.f32 r0, r0, %f352; mov.f32 %f355, r0;} // end inline asm mov.u32 %r121, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f355, %r121, %r128, %r129; @p add.f32 r0, r0, %f355; mov.f32 %f358, r0;} // end inline asm mov.u32 %r124, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f358, %r124, %r128, %r129; @p add.f32 r0, r0, %f358; mov.f32 %f361, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f361, %r127, %r128, %r129; @p add.f32 r0, r0, %f361; mov.f32 %f513, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r242, %r112, -112; st.shared.f32 [%r242+640], %f514; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f387, [%r43+656]; add.ftz.f32 %f388, %f514, %f387; ld.shared.f32 %f389, [%r43+660]; add.ftz.f32 %f390, %f388, %f389; ld.shared.f32 %f391, [%r43+664]; add.ftz.f32 %f374, %f390, %f391; mov.u32 %r131, 1; mov.u32 %r144, 31; mov.u32 %r145, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f374, %r131, %r144, %r145; @p add.f32 r0, r0, %f374; mov.f32 %f372, r0;} // end inline asm mov.u32 %r134, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f372, %r134, %r144, %r145; @p add.f32 r0, r0, %f372; mov.f32 %f375, r0;} // end inline asm mov.u32 %r137, 4; // begin inline asm 
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f375, %r137, %r144, %r145; @p add.f32 r0, r0, %f375; mov.f32 %f378, r0;} // end inline asm mov.u32 %r140, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f378, %r140, %r144, %r145; @p add.f32 r0, r0, %f378; mov.f32 %f381, r0;} // end inline asm mov.u32 %r143, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f381, %r143, %r144, %r145; @p add.f32 r0, r0, %f381; mov.f32 %f514, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r243, %r112, -112; st.shared.f32 [%r243+1280], %f515; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f407, [%r43+1296]; add.ftz.f32 %f408, %f515, %f407; ld.shared.f32 %f409, [%r43+1300]; add.ftz.f32 %f410, %f408, %f409; ld.shared.f32 %f411, [%r43+1304]; add.ftz.f32 %f394, %f410, %f411; mov.u32 %r147, 1; mov.u32 %r160, 31; mov.u32 %r161, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f394, %r147, %r160, %r161; @p add.f32 r0, r0, %f394; mov.f32 %f392, r0;} // end inline asm mov.u32 %r150, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f392, %r150, %r160, %r161; @p add.f32 r0, r0, %f392; mov.f32 %f395, r0;} // end inline asm mov.u32 %r153, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f395, %r153, %r160, %r161; @p add.f32 r0, r0, %f395; mov.f32 %f398, r0;} // end inline asm mov.u32 %r156, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f398, %r156, %r160, %r161; @p add.f32 r0, r0, %f398; mov.f32 %f401, r0;} // end inline asm mov.u32 %r159, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f401, %r159, %r160, %r161; @p add.f32 r0, r0, %f401; mov.f32 %f515, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r244, %r112, -112; st.shared.f32 [%r244+1920], %f516; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f427, [%r43+1936]; 
add.ftz.f32 %f428, %f516, %f427; ld.shared.f32 %f429, [%r43+1940]; add.ftz.f32 %f430, %f428, %f429; ld.shared.f32 %f431, [%r43+1944]; add.ftz.f32 %f414, %f430, %f431; mov.u32 %r163, 1; mov.u32 %r176, 31; mov.u32 %r177, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f414, %r163, %r176, %r177; @p add.f32 r0, r0, %f414; mov.f32 %f412, r0;} // end inline asm mov.u32 %r166, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f412, %r166, %r176, %r177; @p add.f32 r0, r0, %f412; mov.f32 %f415, r0;} // end inline asm mov.u32 %r169, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f415, %r169, %r176, %r177; @p add.f32 r0, r0, %f415; mov.f32 %f418, r0;} // end inline asm mov.u32 %r172, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f418, %r172, %r176, %r177; @p add.f32 r0, r0, %f418; mov.f32 %f421, r0;} // end inline asm mov.u32 %r175, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f421, %r175, %r176, %r177; @p add.f32 r0, r0, %f421; mov.f32 %f516, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r245, %r112, -112; st.shared.f32 [%r245+2560], %f517; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f447, [%r43+2576]; add.ftz.f32 %f448, %f517, %f447; ld.shared.f32 %f449, [%r43+2580]; add.ftz.f32 %f450, %f448, %f449; ld.shared.f32 %f451, [%r43+2584]; add.ftz.f32 %f434, %f450, %f451; mov.u32 %r179, 1; mov.u32 %r192, 31; mov.u32 %r193, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f434, %r179, %r192, %r193; @p add.f32 r0, r0, %f434; mov.f32 %f432, r0;} // end inline asm mov.u32 %r182, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f432, %r182, %r192, %r193; @p add.f32 r0, r0, %f432; mov.f32 %f435, r0;} // end inline asm mov.u32 %r185, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f435, %r185, %r192, 
%r193; @p add.f32 r0, r0, %f435; mov.f32 %f438, r0;} // end inline asm mov.u32 %r188, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f438, %r188, %r192, %r193; @p add.f32 r0, r0, %f438; mov.f32 %f441, r0;} // end inline asm mov.u32 %r191, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f441, %r191, %r192, %r193; @p add.f32 r0, r0, %f441; mov.f32 %f517, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r246, %r112, -112; st.shared.f32 [%r246+3200], %f518; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f467, [%r43+3216]; add.ftz.f32 %f468, %f518, %f467; ld.shared.f32 %f469, [%r43+3220]; add.ftz.f32 %f470, %f468, %f469; ld.shared.f32 %f471, [%r43+3224]; add.ftz.f32 %f454, %f470, %f471; mov.u32 %r195, 1; mov.u32 %r208, 31; mov.u32 %r209, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f454, %r195, %r208, %r209; @p add.f32 r0, r0, %f454; mov.f32 %f452, r0;} // end inline asm mov.u32 %r198, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f452, %r198, %r208, %r209; @p add.f32 r0, r0, %f452; mov.f32 %f455, r0;} // end inline asm mov.u32 %r201, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f455, %r201, %r208, %r209; @p add.f32 r0, r0, %f455; mov.f32 %f458, r0;} // end inline asm mov.u32 %r204, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f458, %r204, %r208, %r209; @p add.f32 r0, r0, %f458; mov.f32 %f461, r0;} // end inline asm mov.u32 %r207, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f461, %r207, %r208, %r209; @p add.f32 r0, r0, %f461; mov.f32 %f518, r0;} // end inline asm $L__BB0_33: @%p8 bra $L__BB0_35; add.s32 %r247, %r112, -112; st.shared.f32 [%r247+3840], %f519; $L__BB0_35: bar.sync 0; @%p9 bra $L__BB0_37; ld.shared.f32 %f487, [%r43+3856]; add.ftz.f32 %f488, %f519, %f487; ld.shared.f32 %f489, [%r43+3860]; 
add.ftz.f32 %f490, %f488, %f489; ld.shared.f32 %f491, [%r43+3864]; add.ftz.f32 %f474, %f490, %f491; mov.u32 %r211, 1; mov.u32 %r224, 31; mov.u32 %r225, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f474, %r211, %r224, %r225; @p add.f32 r0, r0, %f474; mov.f32 %f472, r0;} // end inline asm mov.u32 %r214, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f472, %r214, %r224, %r225; @p add.f32 r0, r0, %f472; mov.f32 %f475, r0;} // end inline asm mov.u32 %r217, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f475, %r217, %r224, %r225; @p add.f32 r0, r0, %f475; mov.f32 %f478, r0;} // end inline asm mov.u32 %r220, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f478, %r220, %r224, %r225; @p add.f32 r0, r0, %f478; mov.f32 %f481, r0;} // end inline asm mov.u32 %r223, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f481, %r223, %r224, %r225; @p add.f32 r0, r0, %f481; mov.f32 %f519, r0;} // end inline asm $L__BB0_37: mov.u32 %r227, %tid.y; or.b32 %r228, %r55, %r227; setp.ne.s32 %p22, %r228, 0; @%p22 bra $L__BB0_53; ld.param.u64 %rd87, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p23, %rd87, 0; mul.ftz.f32 %f534, %f84, %f513; mov.u32 %r229, %ctaid.x; cvt.s64.s32 %rd7, %r229; @%p23 bra $L__BB0_40; shl.b64 %rd49, %rd7, 1; add.s64 %rd50, %rd2, %rd49; ld.global.u16 %rs243, [%rd50]; // begin inline asm { cvt.f32.f16 %f492, %rs243;} // end inline asm fma.rn.ftz.f32 %f534, %f85, %f492, %f534; $L__BB0_40: ld.param.u64 %rd88, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs244, %f534;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd51, 1.0; // end inline asm shl.b64 %rd54, %rd7, 1; add.s64 %rd52, %rd88, %rd54; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 
[%rd52], %rs244, %rd51; // end inline asm mul.ftz.f32 %f535, %f84, %f514; add.s32 %r231, %r51, %r229; cvt.s64.s32 %rd10, %r231; @%p23 bra $L__BB0_42; shl.b64 %rd55, %rd10, 1; add.s64 %rd56, %rd2, %rd55; ld.global.u16 %rs246, [%rd56]; // begin inline asm { cvt.f32.f16 %f494, %rs246;} // end inline asm fma.rn.ftz.f32 %f535, %f85, %f494, %f535; $L__BB0_42: cvt.s64.s32 %rd11, %r51; mul.wide.s32 %rd60, %r51, 2; add.s64 %rd58, %rd52, %rd60; // begin inline asm { cvt.rn.f16.f32 %rs247, %f535;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd57, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd58], %rs247, %rd57; // end inline asm mul.ftz.f32 %f536, %f84, %f515; cvt.u32.u64 %r232, %rd10; add.s32 %r233, %r232, %r51; cvt.s64.s32 %rd12, %r233; @%p23 bra $L__BB0_44; shl.b64 %rd61, %rd12, 1; add.s64 %rd62, %rd2, %rd61; ld.global.u16 %rs249, [%rd62]; // begin inline asm { cvt.f32.f16 %f496, %rs249;} // end inline asm fma.rn.ftz.f32 %f536, %f85, %f496, %f536; $L__BB0_44: ld.param.u64 %rd89, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0]; shl.b64 %rd66, %rd12, 1; add.s64 %rd64, %rd89, %rd66; // begin inline asm { cvt.rn.f16.f32 %rs250, %f536;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd63, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd64], %rs250, %rd63; // end inline asm mul.ftz.f32 %f537, %f84, %f516; cvt.u32.u64 %r234, %rd12; add.s32 %r235, %r234, %r51; cvt.s64.s32 %rd14, %r235; @%p23 bra $L__BB0_46; shl.b64 %rd67, %rd14, 1; add.s64 %rd68, %rd2, %rd67; ld.global.u16 %rs252, [%rd68]; // begin inline asm { cvt.f32.f16 %f498, %rs252;} // end inline asm fma.rn.ftz.f32 %f537, %f85, %f498, %f537; $L__BB0_46: // begin inline asm { cvt.rn.f16.f32 %rs253, %f537;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 
%rd69, 1.0; // end inline asm shl.b64 %rd15, %rd11, 1; add.s64 %rd70, %rd64, %rd15; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd70], %rs253, %rd69; // end inline asm mul.ftz.f32 %f538, %f84, %f517; cvt.u32.u64 %r236, %rd14; add.s32 %r237, %r236, %r51; cvt.s64.s32 %rd17, %r237; @%p23 bra $L__BB0_48; shl.b64 %rd72, %rd17, 1; add.s64 %rd73, %rd2, %rd72; ld.global.u16 %rs255, [%rd73]; // begin inline asm { cvt.f32.f16 %f500, %rs255;} // end inline asm fma.rn.ftz.f32 %f538, %f85, %f500, %f538; $L__BB0_48: // begin inline asm { cvt.rn.f16.f32 %rs256, %f538;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd74, 1.0; // end inline asm add.s64 %rd75, %rd70, %rd15; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd75], %rs256, %rd74; // end inline asm mul.ftz.f32 %f539, %f84, %f518; cvt.u32.u64 %r238, %rd17; add.s32 %r239, %r238, %r51; cvt.s64.s32 %rd19, %r239; @%p23 bra $L__BB0_50; shl.b64 %rd77, %rd19, 1; add.s64 %rd78, %rd2, %rd77; ld.global.u16 %rs258, [%rd78]; // begin inline asm { cvt.f32.f16 %f502, %rs258;} // end inline asm fma.rn.ftz.f32 %f539, %f85, %f502, %f539; $L__BB0_50: // begin inline asm { cvt.rn.f16.f32 %rs259, %f539;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd79, 1.0; // end inline asm add.s64 %rd80, %rd75, %rd15; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd80], %rs259, %rd79; // end inline asm mul.ftz.f32 %f540, %f84, %f519; cvt.u32.u64 %r240, %rd19; add.s32 %r45, %r240, %r51; @%p23 bra $L__BB0_52; mul.wide.s32 %rd82, %r45, 2; add.s64 %rd83, %rd2, %rd82; ld.global.u16 %rs261, [%rd83]; // begin inline asm { cvt.f32.f16 %f504, %rs261;} // end inline asm fma.rn.ftz.f32 %f540, %f85, %f504, %f540; $L__BB0_52: // begin inline asm { cvt.rn.f16.f32 %rs262, %f540;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd84, 1.0; // 
end inline asm
        // Tail of the dequant-GEMV kernel (its body begins above this chunk):
        // last of the batched f16 result stores. %rd85 = previous store address
        // + %rd15, where %rd15 = 2 * %r51 (computed earlier via shl.b64 of the
        // sign-extended %r51; f16 elements are 2 bytes) -- appears to be the
        // per-batch output stride in bytes. TODO(review): confirm against the
        // kernel launch parameters.
        add.s64         %rd85, %rd80, %rd15;
        // begin inline asm
        // Store the f16-converted accumulator (%rs262) with explicit cache
        // hints: L1 evict_normal, L2 policy handle in %rd84 (created above with
        // createpolicy evict_last/evict_unchanged).
        st.global.L1::evict_normal.L2::cache_hint.u16 [%rd85], %rs262, %rd84;
        // end inline asm

$L__BB0_53:
        ret;
}
        // .globl _ZN3cub11EmptyKernelIvEEvv

// void cub::EmptyKernel<void>()
// Trivial no-op kernel instantiated by the CUB library (name demangles to
// cub::EmptyKernel<void>): takes no parameters and its body is a single ret.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
        ret;
}