#!/bin/bash
# Standardized test launcher for ROCm MIOpen

TESTSDIR="/usr/libexec/rocm/libmiopen1-tests"

# Preflight checks. Each guard terminates the script on its own:
#   - /dev/kfd missing                        -> skip  (exit 77)
#   - /dev/kfd unreadable and we are not root -> skip  (exit 77)
#   - test directory missing                  -> abort (exit 1)
if [[ ! -e /dev/kfd ]]; then
    echo "/dev/kfd not present, system either lacks AMD GPU or AMDGPU driver is not loaded."
    echo "Skipping tests."
    # Magic number to signal 'skipped'
    exit 77
fi

if [[ "$(id -u)" != "0" && ! -r /dev/kfd ]]; then
    echo "/dev/kfd present but no read permission."
    echo "Skipping tests."
    exit 77
fi

if [[ ! -d "$TESTSDIR" ]]; then
    # Unlike the skip notices above, this diagnostic goes to stderr
    {
        echo "The directory $TESTSDIR does not exist."
        echo "Aborting."
    } >&2
    exit 1
fi

# Work from the autopkgtest scratch directory, falling back to /tmp when run
# manually outside of autopkgtest. Exit status 16 = testbed failure.
if ! cd "${AUTOPKGTEST_TMP:-/tmp}"; then
    exit 16
fi

# Copy the amdgpu_firmware_info file of every entry below a directory into a
# destination directory, one output file per entry.
# Arguments: $1 - directory whose entries are scanned
#            $2 - directory receiving amdgpu_firmware_info.<entry name>
# Entries that have no regular amdgpu_firmware_info file are skipped.
save_amdgpu_firmware_info() {
    local dri_root=$1 dest_dir=$2
    local entry info
    # Glob instead of parsing `ls` output, so entry names are never word-split
    # or glob-expanded. An unmatched glob stays literal and is rejected by the
    # -f test below.
    for entry in "$dri_root"/*
    do
        info="$entry/amdgpu_firmware_info"
        if [ -f "$info" ]
        then
            # shellcheck disable=SC2024   # we don't need privileged write
            cat "$info" > "$dest_dir/amdgpu_firmware_info.${entry##*/}"
        fi
    done
}

# If autopkgtest called us, setup artifacts and gather system info
if [ -n "$AUTOPKGTEST_ARTIFACTS" ]; then
    # Save gtest output as XML to artifacts for download, unless the caller
    # already chose a GTEST_OUTPUT destination
    [ -n "${GTEST_OUTPUT:-}" ] || export GTEST_OUTPUT="xml:${AUTOPKGTEST_ARTIFACTS}/"

    # First, gather system info. Mounting debugfs is best-effort: a failure
    # is ignored and handled by the directory check that follows.
    mount -t debugfs none /sys/kernel/debug || true
    if [ -d /sys/kernel/debug/dri ]
    then
        save_amdgpu_firmware_info /sys/kernel/debug/dri "$AUTOPKGTEST_ARTIFACTS"
    else
        echo "Could not read /sys/kernel/debug/dri" >> "$AUTOPKGTEST_ARTIFACTS/firmware.err"
    fi
    # Kernel log snapshot taken before any test runs (best-effort)
    # shellcheck disable=SC2024   # we don't need privileged write
    dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.before" || true
fi

# Overall result: stays 0 only while every test invocation succeeds
EXITCODE=0

# Run the consolidated gtest suite
GTEST_BIN="$TESTSDIR/miopen_gtest"
if [ ! -x "$GTEST_BIN" ]; then
    echo "Warning: miopen_gtest not found in $TESTSDIR"
else
    # Patterns excluded from the main run and not re-run afterwards
    runtime_patterns=(
        "*SerialRun3D_*"
        "*GPU_ConvBias*"
        "*Full/GPU_FusionSetArg_FP16*"
        "*UnitTestConvSolver*"
        "*UnitTestImplicit*"
        "*Full/GPU_GetitemBwd_FP32*"
    )

    # Patterns excluded from the main run and re-run one at a time below
    db_patterns=(
        "*GPU_UnitTestActivationDescriptor*"
        "*Smoke/GPU_bn_infer_fused_spatial_*"
        "*Smoke/GPU_UnitTestConvSolverGemm*"
        "*Smoke/GPU_bn_infer_fused_per_act*"
        "*Full/GPU_Cat_FP32*"
        "*Full/GPU_GLU_*"
        "*Smoke/GPU_Bwd_Mha*"
        "*Smoke/GPU_MhaBwd*"
        "*Unit/GPU_MhaFwd*"
        "*Full/GPU_GroupNorm_*"
        "*Full/GPU_TestActivation_*"
        "*Smoke/GPU_Kthvalue_fwd*"
        "*Full/GPU_LayoutTransposeTest_*"
        "*GPU_Lrn_*"
        "*Smoke/GPU_MultiMarginLoss_*"
        "*Full/GPU_BNActivInfer_*"
        "*GPU_PReLU*"
        "*GPU_Reduce_*"
        "*GPU_ReduceCalculationTest_*"
        "*GPU_ReduceExtremeTest_*"
        "*Full/GPU_RoP*"
        "*Full/GPU_Softmax_*"
        "*Full/GPU_SoftMarginLoss*"
        "*Smoke/GPU_Op4dTensorGenericTest_*"
        "*GPU_TernaryTensorOps_*"
        "*GPU_unaryTensorOps_*"
        "*GPU_VecAddTest*"
    )

    # First run all tests except the excluded ones: the filter is a leading
    # '-' followed by both lists joined with ':'. IFS is changed only inside
    # the command substitution, so the rest of the script is unaffected.
    main_filter="-$(IFS=:; printf '%s' "${runtime_patterns[*]}:${db_patterns[*]}")"
    "$GTEST_BIN" --gtest_filter="$main_filter" || EXITCODE=1

    # Then execute on separate invocations of the test binary each of the tests
    # that have db cleanup issues.
    for pattern in "${db_patterns[@]}"; do
        echo "=================================================="
        echo "Running isolated DB test: $pattern"
        echo "=================================================="
        "$GTEST_BIN" --gtest_filter="$pattern" || EXITCODE=1
    done
fi

# Run MIOpenDriver from $TESTSDIR with the given arguments.
# Globals:   TESTSDIR (read), EXITCODE (set to 1 when the driver fails)
# Arguments: passed through unchanged to MIOpenDriver
# Returns:   always 0; a failing invocation is only recorded in EXITCODE
run_miopen_driver() {
    # Quoted so a TESTSDIR containing whitespace or glob characters still
    # resolves to a single command word.
    "${TESTSDIR}/MIOpenDriver" "$@" || EXITCODE=1
}

# Test with MIOpen driver (Only standard suites that support both fp16 suffix and -t 1 flag)
TEST_SUITES=(pool lrn activ softmax bnorm reduce)
for SUITE in "${TEST_SUITES[@]}"; do
    run_miopen_driver "${SUITE}" -V 1 -t 1
    run_miopen_driver "${SUITE}fp16" -V 1 -t 1
done

# RNN doesn't support the -t flag
run_miopen_driver rnn -V 1
run_miopen_driver rnnfp16 -V 1

# TensorOp doesn't have an fp16 variant
run_miopen_driver tensorop -V 1 -t 1

# CTC
run_miopen_driver ctc -V 1 -t 1

# Tests might have generated new kernel messages: when autopkgtest provided an
# artifacts directory, snapshot the kernel log again (best-effort).
# shellcheck disable=SC2024   # we don't need privileged write
[ -z "$AUTOPKGTEST_ARTIFACTS" ] || dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.after" || true

# Report the accumulated result
exit $EXITCODE
