#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH recovery test: inject an error into every eligible PCI device
# (function 0, not a bridge, not a VF, not driven by ahci) and count how many
# fail to recover. Exits 0 when every device recovered, non-zero otherwise,
# and $KSELFTESTS_SKIP when the prerequisites are not present.
#
# eeh_supported, pe_ok and eeh_one_dev are not defined in this file; they are
# presumably provided by eeh-functions.sh, sourced below.

KSELFTESTS_SKIP=4

. ./eeh-functions.sh

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit $KSELFTESTS_SKIP
fi

# NOTE(review): this only skips when BOTH debugfs files are missing. If the
# helpers need each of them, "||" would be the right operator -- confirm
# against eeh-functions.sh before changing.
if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit $KSELFTESTS_SKIP
fi

# Snapshot the device list so it can be diffed against the post-test state.
pre_lspci=$(mktemp) || {
	echo "Unable to create a temporary file" >&2
	exit 1
}
# Remove the snapshot on every exit path. The signal traps turn an interrupt
# into a normal exit so that the EXIT trap also runs in shells that do not
# fire it on signal death.
trap 'rm -f -- "$pre_lspci"' EXIT
trap 'exit 1' HUP INT TERM

lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices (function 0 of each device only).
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue
	dev=${dev_path##*/}

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Only resolve the driver link when a driver is actually bound.
	if [ -e "$dev_path/driver" ] && \
	   [ "ahci" = "$(basename "$(realpath "$dev_path/driver")")" ] ; then
		echo "$dev, Skipped: ahci doesn't support recovery"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check.
	devices="$devices $dev"
	dev_count=$((dev_count + 1))
done

echo "Found ${dev_count} breakable devices..."

failed=0
# $devices is a space-separated list of PCI addresses; word splitting is
# intentional here.
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state: breaking an earlier device may have affected
	# this one. A device that cannot be tested is counted as a failure.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed=$((failed + 1))
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed=$((failed + 1))
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"
# Show any difference between the device list before and after the test.
lspci | diff -u "$pre_lspci" -

test "$failed" -eq 0
exit $?