scsi_error.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  scsi_error.c Copyright (C) 1997 Eric Youngdale
 *
 *  SCSI error/timeout handling
 *      Initial versions: Eric Youngdale.  Based upon conversations with
 *                        Leonard Zubkoff and David Miller at Linux Expo,
 *                        ideas originating from all over the place.
 *
 *	Restructured scsi_unjam_host and associated functions.
 *	September 04, 2002 Mike Anderson ([email protected])
 *
 *	Forward port of Russell King's ([email protected]) changes and
 *	minor cleanups.
 *	September 30, 2002 Mike Anderson ([email protected])
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_common.h>
#include <scsi/scsi_transport.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_dh.h>
#include <scsi/scsi_devinfo.h>
#include <scsi/sg.h>

#include "scsi_priv.h"
#include "scsi_logging.h"
#include "scsi_transport_api.h"

#include <trace/events/scsi.h>

#include <asm/unaligned.h>

/*
 * These should *probably* be handled by the host itself.
 * Since it is allowed to sleep, it probably should.
 */
#define BUS_RESET_SETTLE_TIME …
#define HOST_RESET_SETTLE_TIME …

static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *,
						   struct scsi_cmnd *);

void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy)
{ … }

/**
 * scsi_schedule_eh - schedule EH for SCSI host
 * @shost:	SCSI host to invoke error handling on.
 *
 * Schedule SCSI EH without scmd.
 */
void scsi_schedule_eh(struct Scsi_Host *shost)
{ … }
EXPORT_SYMBOL_GPL(…);

static int scsi_host_eh_past_deadline(struct Scsi_Host *shost)
{ … }

static bool scsi_cmd_retry_allowed(struct scsi_cmnd *cmd)
{ … }

static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
{ … }

/**
 * scmd_eh_abort_handler - Handle command aborts
 * @work:	command to be aborted.
 *
 * Note: this function must be called only for a command that has timed out.
 * Because the block layer marks a request as complete before it calls
 * scsi_timeout(), a .scsi_done() call from the LLD for a command that has
 * timed out do not have any effect. Hence it is safe to call
 * scsi_finish_command() from this function.
 */
void
scmd_eh_abort_handler(struct work_struct *work)
{ … }

/**
 * scsi_abort_command - schedule a command abort
 * @scmd:	scmd to abort.
 *
 * We only need to abort commands after a command timeout
 */
static int
scsi_abort_command(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_eh_reset - call into ->eh_action to reset internal counters
 * @scmd:	scmd to run eh on.
 *
 * The scsi driver might be carrying internal state about the
 * devices, so we need to call into the driver to reset the
 * internal state once the error handler is started.
 */
static void scsi_eh_reset(struct scsi_cmnd *scmd)
{ … }

static void scsi_eh_inc_host_failed(struct rcu_head *head)
{ … }

/**
 * scsi_eh_scmd_add - add scsi cmd to error handling.
 * @scmd:	scmd to run eh on.
 */
void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_timeout - Timeout function for normal scsi commands.
 * @req:	request that is timing out.
 *
 * Notes:
 *     We do not need to lock this.  There is the potential for a race
 *     only in that the normal completion handling might run, but if the
 *     normal completion function determines that the timer has already
 *     fired, then it mustn't do anything.
 */
enum blk_eh_timer_return scsi_timeout(struct request *req)
{ … }

/**
 * scsi_block_when_processing_errors - Prevent cmds from being queued.
 * @sdev:	Device on which we are performing recovery.
 *
 * Description:
 *     We block until the host is out of error recovery, and then check to
 *     see whether the host or the device is offline.
 *
 * Return value:
 *     0 when dev was taken offline by error recovery. 1 OK to proceed.
 */
int scsi_block_when_processing_errors(struct scsi_device *sdev)
{ … }
EXPORT_SYMBOL(…);

#ifdef CONFIG_SCSI_LOGGING
/**
 * scsi_eh_prt_fail_stats - Log info on failures.
 * @shost:	scsi host being recovered.
 * @work_q:	Queue of scsi cmds to process.
 */
static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost,
					  struct list_head *work_q)
{ … }
#endif

 /**
 * scsi_report_lun_change - Set flag on all *other* devices on the same target
 *                          to indicate that a UNIT ATTENTION is expected.
 * @sdev:	Device reporting the UNIT ATTENTION
 */
static void scsi_report_lun_change(struct scsi_device *sdev)
{ … }

/**
 * scsi_report_sense - Examine scsi sense information and log messages for
 *		       certain conditions, also issue uevents for some of them.
 * @sdev:	Device reporting the sense code
 * @sshdr:	sshdr to be examined
 */
static void scsi_report_sense(struct scsi_device *sdev,
			      struct scsi_sense_hdr *sshdr)
{ … }

static inline void set_scsi_ml_byte(struct scsi_cmnd *cmd, u8 status)
{ … }

/**
 * scsi_check_sense - Examine scsi cmd sense
 * @scmd:	Cmd to have sense checked.
 *
 * Return value:
 *	SUCCESS or FAILED or NEEDS_RETRY or ADD_TO_MLQUEUE
 *
 * Notes:
 *	When a deferred error is detected the current command has
 *	not been executed and needs retrying.
 */
enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd)
{ … }
EXPORT_SYMBOL_GPL(…);

static void scsi_handle_queue_ramp_up(struct scsi_device *sdev)
{ … }

static void scsi_handle_queue_full(struct scsi_device *sdev)
{ … }

/**
 * scsi_eh_completed_normally - Disposition a eh cmd on return from LLD.
 * @scmd:	SCSI cmd to examine.
 *
 * Notes:
 *    This is *only* called when we are examining the status of commands
 *    queued during error recovery.  the main difference here is that we
 *    don't allow for the possibility of retries here, and we are a lot
 *    more restrictive about what we consider acceptable.
 */
static enum scsi_disposition scsi_eh_completed_normally(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_eh_done - Completion function for error handling.
 * @scmd:	Cmd that is done.
 */
void scsi_eh_done(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_try_host_reset - ask host adapter to reset itself
 * @scmd:	SCSI cmd to send host reset.
 */
static enum scsi_disposition scsi_try_host_reset(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_try_bus_reset - ask host to perform a bus reset
 * @scmd:	SCSI cmd to send bus reset.
 */
static enum scsi_disposition scsi_try_bus_reset(struct scsi_cmnd *scmd)
{ … }

static void __scsi_report_device_reset(struct scsi_device *sdev, void *data)
{ … }

/**
 * scsi_try_target_reset - Ask host to perform a target reset
 * @scmd:	SCSI cmd used to send a target reset
 *
 * Notes:
 *    There is no timeout for this operation.  if this operation is
 *    unreliable for a given host, then the host itself needs to put a
 *    timer on it, and set the host back to a consistent state prior to
 *    returning.
 */
static enum scsi_disposition scsi_try_target_reset(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_try_bus_device_reset - Ask host to perform a BDR on a dev
 * @scmd:	SCSI cmd used to send BDR
 *
 * Notes:
 *    There is no timeout for this operation.  if this operation is
 *    unreliable for a given host, then the host itself needs to put a
 *    timer on it, and set the host back to a consistent state prior to
 *    returning.
 */
static enum scsi_disposition scsi_try_bus_device_reset(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_try_to_abort_cmd - Ask host to abort a SCSI command
 * @hostt:	SCSI driver host template
 * @scmd:	SCSI cmd used to send a target reset
 *
 * Return value:
 *	SUCCESS, FAILED, or FAST_IO_FAIL
 *
 * Notes:
 *    SUCCESS does not necessarily indicate that the command
 *    has been aborted; it only indicates that the LLDDs
 *    has cleared all references to that command.
 *    LLDDs should return FAILED only if an abort was required
 *    but could not be executed. LLDDs should return FAST_IO_FAIL
 *    if the device is temporarily unavailable (eg due to a
 *    link down on FibreChannel)
 */
static enum scsi_disposition
scsi_try_to_abort_cmd(const struct scsi_host_template *hostt, struct scsi_cmnd *scmd)
{ … }

static void scsi_abort_eh_cmnd(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_eh_prep_cmnd  - Save a scsi command info as part of error recovery
 * @scmd:       SCSI command structure to hijack
 * @ses:        structure to save restore information
 * @cmnd:       CDB to send. Can be NULL if no new cmnd is needed
 * @cmnd_size:  size in bytes of @cmnd (must be <= MAX_COMMAND_SIZE)
 * @sense_bytes: size of sense data to copy. or 0 (if != 0 @cmnd is ignored)
 *
 * This function is used to save a scsi command information before re-execution
 * as part of the error recovery process.  If @sense_bytes is 0 the command
 * sent must be one that does not transfer any data.  If @sense_bytes != 0
 * @cmnd is ignored and this functions sets up a REQUEST_SENSE command
 * and cmnd buffers to read @sense_bytes into @scmd->sense_buffer.
 */
void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
			unsigned char *cmnd, int cmnd_size, unsigned sense_bytes)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_eh_restore_cmnd  - Restore a scsi command info as part of error recovery
 * @scmd:       SCSI command structure to restore
 * @ses:        saved information from a coresponding call to scsi_eh_prep_cmnd
 *
 * Undo any damage done by above scsi_eh_prep_cmnd().
 */
void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_send_eh_cmnd  - submit a scsi command as part of error recovery
 * @scmd:       SCSI command structure to hijack
 * @cmnd:       CDB to send
 * @cmnd_size:  size in bytes of @cmnd
 * @timeout:    timeout for this request
 * @sense_bytes: size of sense data to copy or 0
 *
 * This function is used to send a scsi command down to a target device
 * as part of the error recovery process. See also scsi_eh_prep_cmnd() above.
 *
 * Return value:
 *    SUCCESS or FAILED or NEEDS_RETRY
 */
static enum scsi_disposition scsi_send_eh_cmnd(struct scsi_cmnd *scmd,
	unsigned char *cmnd, int cmnd_size, int timeout, unsigned sense_bytes)
{ … }

/**
 * scsi_request_sense - Request sense data from a particular target.
 * @scmd:	SCSI cmd for request sense.
 *
 * Notes:
 *    Some hosts automatically obtain this information, others require
 *    that we obtain it on our own. This function will *not* return until
 *    the command either times out, or it completes.
 */
static enum scsi_disposition scsi_request_sense(struct scsi_cmnd *scmd)
{ … }

static enum scsi_disposition
scsi_eh_action(struct scsi_cmnd *scmd, enum scsi_disposition rtn)
{ … }

/**
 * scsi_eh_finish_cmd - Handle a cmd that eh is finished with.
 * @scmd:	Original SCSI cmd that eh has finished.
 * @done_q:	Queue for processed commands.
 *
 * Notes:
 *    We don't want to use the normal command completion while we are are
 *    still handling errors - it may cause other commands to be queued,
 *    and that would disturb what we are doing.  Thus we really want to
 *    keep a list of pending commands for final completion, and once we
 *    are ready to leave error handling we handle completion for real.
 */
void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_eh_get_sense - Get device sense data.
 * @work_q:	Queue of commands to process.
 * @done_q:	Queue of processed commands.
 *
 * Description:
 *    See if we need to request sense information.  if so, then get it
 *    now, so we have a better idea of what to do.
 *
 * Notes:
 *    This has the unfortunate side effect that if a shost adapter does
 *    not automatically request sense information, we end up shutting
 *    it down before we request it.
 *
 *    All drivers should request sense information internally these days,
 *    so for now all I have to say is tough noogies if you end up in here.
 *
 *    XXX: Long term this code should go away, but that needs an audit of
 *         all LLDDs first.
 */
int scsi_eh_get_sense(struct list_head *work_q,
		      struct list_head *done_q)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * scsi_eh_tur - Send TUR to device.
 * @scmd:	&scsi_cmnd to send TUR
 *
 * Return value:
 *    0 - Device is ready. 1 - Device NOT ready.
 */
static int scsi_eh_tur(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_eh_test_devices - check if devices are responding from error recovery.
 * @cmd_list:	scsi commands in error recovery.
 * @work_q:	queue for commands which still need more error recovery
 * @done_q:	queue for commands which are finished
 * @try_stu:	boolean on if a STU command should be tried in addition to TUR.
 *
 * Decription:
 *    Tests if devices are in a working state.  Commands to devices now in
 *    a working state are sent to the done_q while commands to devices which
 *    are still failing to respond are returned to the work_q for more
 *    processing.
 **/
static int scsi_eh_test_devices(struct list_head *cmd_list,
				struct list_head *work_q,
				struct list_head *done_q, int try_stu)
{ … }

/**
 * scsi_eh_try_stu - Send START_UNIT to device.
 * @scmd:	&scsi_cmnd to send START_UNIT
 *
 * Return value:
 *    0 - Device is ready. 1 - Device NOT ready.
 */
static int scsi_eh_try_stu(struct scsi_cmnd *scmd)
{ … }

 /**
 * scsi_eh_stu - send START_UNIT if needed
 * @shost:	&scsi host being recovered.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 *
 * Notes:
 *    If commands are failing due to not ready, initializing command required,
 *	try revalidating the device, which will end up sending a start unit.
 */
static int scsi_eh_stu(struct Scsi_Host *shost,
			      struct list_head *work_q,
			      struct list_head *done_q)
{ … }


/**
 * scsi_eh_bus_device_reset - send bdr if needed
 * @shost:	scsi host being recovered.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 *
 * Notes:
 *    Try a bus device reset.  Still, look to see whether we have multiple
 *    devices that are jammed or not - if we have multiple devices, it
 *    makes no sense to try bus_device_reset - we really would need to try
 *    a bus_reset instead.
 */
static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
				    struct list_head *work_q,
				    struct list_head *done_q)
{ … }

/**
 * scsi_eh_target_reset - send target reset if needed
 * @shost:	scsi host being recovered.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 *
 * Notes:
 *    Try a target reset.
 */
static int scsi_eh_target_reset(struct Scsi_Host *shost,
				struct list_head *work_q,
				struct list_head *done_q)
{ … }

/**
 * scsi_eh_bus_reset - send a bus reset
 * @shost:	&scsi host being recovered.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 */
static int scsi_eh_bus_reset(struct Scsi_Host *shost,
			     struct list_head *work_q,
			     struct list_head *done_q)
{ … }

/**
 * scsi_eh_host_reset - send a host reset
 * @shost:	host to be reset.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 */
static int scsi_eh_host_reset(struct Scsi_Host *shost,
			      struct list_head *work_q,
			      struct list_head *done_q)
{ … }

/**
 * scsi_eh_offline_sdevs - offline scsi devices that fail to recover
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 */
static void scsi_eh_offline_sdevs(struct list_head *work_q,
				  struct list_head *done_q)
{ … }

/**
 * scsi_noretry_cmd - determine if command should be failed fast
 * @scmd:	SCSI cmd to examine.
 */
bool scsi_noretry_cmd(struct scsi_cmnd *scmd)
{ … }

/**
 * scsi_decide_disposition - Disposition a cmd on return from LLD.
 * @scmd:	SCSI cmd to examine.
 *
 * Notes:
 *    This is *only* called when we are examining the status after sending
 *    out the actual data command.  any commands that are queued for error
 *    recovery (e.g. test_unit_ready) do *not* come through here.
 *
 *    When this routine returns failed, it means the error handler thread
 *    is woken.  In cases where the error code indicates an error that
 *    doesn't require the error handler read (i.e. we don't need to
 *    abort/reset), this function should return SUCCESS.
 */
enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd)
{ … }

static enum rq_end_io_ret eh_lock_door_done(struct request *req,
					    blk_status_t status)
{ … }

/**
 * scsi_eh_lock_door - Prevent medium removal for the specified device
 * @sdev:	SCSI device to prevent medium removal
 *
 * Locking:
 * 	We must be called from process context.
 *
 * Notes:
 * 	We queue up an asynchronous "ALLOW MEDIUM REMOVAL" request on the
 * 	head of the devices request queue, and continue.
 */
static void scsi_eh_lock_door(struct scsi_device *sdev)
{ … }

/**
 * scsi_restart_operations - restart io operations to the specified host.
 * @shost:	Host we are restarting.
 *
 * Notes:
 *    When we entered the error handler, we blocked all further i/o to
 *    this device.  we need to 'reverse' this process.
 */
static void scsi_restart_operations(struct Scsi_Host *shost)
{ … }

/**
 * scsi_eh_ready_devs - check device ready state and recover if not.
 * @shost:	host to be recovered.
 * @work_q:	&list_head for pending commands.
 * @done_q:	&list_head for processed commands.
 */
void scsi_eh_ready_devs(struct Scsi_Host *shost,
			struct list_head *work_q,
			struct list_head *done_q)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * scsi_eh_flush_done_q - finish processed commands or retry them.
 * @done_q:	list_head of processed commands.
 */
void scsi_eh_flush_done_q(struct list_head *done_q)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
 * @shost:	Host to unjam.
 *
 * Notes:
 *    When we come in here, we *know* that all commands on the bus have
 *    either completed, failed or timed out.  we also know that no further
 *    commands are being sent to the host, so things are relatively quiet
 *    and we have freedom to fiddle with things as we wish.
 *
 *    This is only the *default* implementation.  it is possible for
 *    individual drivers to supply their own version of this function, and
 *    if the maintainer wishes to do this, it is strongly suggested that
 *    this function be taken as a template and modified.  this function
 *    was designed to correctly handle problems for about 95% of the
 *    different cases out there, and it should always provide at least a
 *    reasonable amount of error recovery.
 *
 *    Any command marked 'failed' or 'timeout' must eventually have
 *    scsi_finish_cmd() called for it.  we do all of the retry stuff
 *    here, so when we restart the host after we return it should have an
 *    empty queue.
 */
static void scsi_unjam_host(struct Scsi_Host *shost)
{ … }

/**
 * scsi_error_handler - SCSI error handler thread
 * @data:	Host for which we are running.
 *
 * Notes:
 *    This is the main error handling loop.  This is run as a kernel thread
 *    for every SCSI host and handles all error handling activity.
 */
int scsi_error_handler(void *data)
{ … }

/*
 * Function:    scsi_report_bus_reset()
 *
 * Purpose:     Utility function used by low-level drivers to report that
 *		they have observed a bus reset on the bus being handled.
 *
 * Arguments:   shost       - Host in question
 *		channel     - channel on which reset was observed.
 *
 * Returns:     Nothing
 *
 * Lock status: Host lock must be held.
 *
 * Notes:       This only needs to be called if the reset is one which
 *		originates from an unknown location.  Resets originated
 *		by the mid-level itself don't need to call this, but there
 *		should be no harm.
 *
 *		The main purpose of this is to make sure that a CHECK_CONDITION
 *		is properly treated.
 */
void scsi_report_bus_reset(struct Scsi_Host *shost, int channel)
{ … }
EXPORT_SYMBOL(…);

/*
 * Function:    scsi_report_device_reset()
 *
 * Purpose:     Utility function used by low-level drivers to report that
 *		they have observed a device reset on the device being handled.
 *
 * Arguments:   shost       - Host in question
 *		channel     - channel on which reset was observed
 *		target	    - target on which reset was observed
 *
 * Returns:     Nothing
 *
 * Lock status: Host lock must be held
 *
 * Notes:       This only needs to be called if the reset is one which
 *		originates from an unknown location.  Resets originated
 *		by the mid-level itself don't need to call this, but there
 *		should be no harm.
 *
 *		The main purpose of this is to make sure that a CHECK_CONDITION
 *		is properly treated.
 */
void scsi_report_device_reset(struct Scsi_Host *shost, int channel, int target)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_ioctl_reset: explicitly reset a host/bus/target/device
 * @dev:	scsi_device to operate on
 * @arg:	reset type (see sg.h)
 */
int
scsi_ioctl_reset(struct scsi_device *dev, int __user *arg)
{ … }

bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd,
				  struct scsi_sense_hdr *sshdr)
{ … }
EXPORT_SYMBOL(…);

/**
 * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format)
 * @sense_buffer:	byte array of sense data
 * @sb_len:		number of valid bytes in sense_buffer
 * @info_out:		pointer to 64 integer where 8 or 4 byte information
 *			field will be placed if found.
 *
 * Return value:
 *	true if information field found, false if not found.
 */
bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len,
			     u64 *info_out)
{ … }
EXPORT_SYMBOL(…);
linux/drivers/scsi/scsi_error.c