; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}

; CHECK-LABEL: test_tcgen05_cp_64x128_v1
define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_64x128_v2
define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v2(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_32x128
define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_32x128(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}


; CHECK-LABEL: test_tcgen05_cp_128x128b
define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x128b(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_128x256b
define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x256b(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_4x256b
define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_4x256b(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; With src_fmt as b6x16_p32
; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32
define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32
define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32
define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32
define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32
define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32
define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; With src_fmt as b4x16_p64
; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64
define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64
define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64
define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64
define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64
define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}

; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64
define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0];
; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1];
; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT:    ret;
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
  call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)

  ret void
}
