scsi: qla2xxx: target: Add back SRR support

Background: loading qla2xxx with "ql2xtgt_tape_enable=1" enables
Sequence Level Error Recovery (SLER), which is most commonly used for
tape drives.  With SLER enabled, if there is a recoverable I/O error
during a SCSI command, a Sequence Retransmission Request (SRR) will be
used to retry the I/O at a low level, completely within the driver,
without propagating the error to the upper levels of the SCSI stack.

SRR support was removed in 2017 by commit 2c39b5ca2a ("qla2xxx: Remove
SRR code"). Add it back, new and improved.

The old removed SRR code used sequence numbers to correlate the SRR
CTIOs with SRR immediate notify messages.  I don't see how that would
work reliably with MSI-X interrupts and multiple queues.  So instead use
the exchange address to find the command associated with the immediate
notify (qlt_srr_to_cmd).

The old removed SRR code had a function qlt_check_srr_debug() to
simulate a SRR, but it didn't work for me.  Instead I just used fiber
optic attenuators attached to the FC cable to reduce the strength of the
signal and induce errors.  Unfortunately this only worked for inducing
SRRs on Data-Out (write) commands, so that is all I was able to test.

The code to build a new scatterlist for a SRR with nonzero offset has
been improved to reduce memory requirements and has been well-tested.
However it does not support protection information.

When a single cmd gets multiple SRRs, the old removed SRR code would
restore the data buffer from the values in cmd->se_cmd before processing
the new SRR.  That might be needed if the offset for the new SRR was
lower than the offset for the previous SRR, but I am not sure if that
can happen.  In my testing, when a single cmd gets multiple SRRs, the
SRR offset always increases or stays the same.  But in case it can
decrease, I added the function qlt_restore_orig_sg().  If this is not
supposed to happen then qlt_restore_orig_sg() can be removed to simplify
the code.

I ran into some HBA firmware bugs with QLE269x, QLE27xx, and QLE28xx
firmware 9.05.xx - 9.08.xx where a SRR would cause the HBA to misbehave
badly.  Since SRRs are rare and therefore difficult to test, I figured
it would be worth checking for the buggy firmware and disabling SLER
with a warning instead of letting others run into the same problem on
the rare occasion that they get a SRR.  This turned out to be difficult
because the firmware version isn't known in the normal NVRAM config
routine, so I added a second NVRAM config routine that is called after
the firmware version is known.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Link: https://patch.msgid.link/654b7181-b79e-40ed-a15b-6d6e441a5d5f@cybernetics.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
This commit is contained in:
Tony Battersby
2025-11-10 11:04:42 -05:00
committed by Martin K. Petersen
parent 04957d8c98
commit c7bd85a7b9
5 changed files with 1141 additions and 1 deletions

View File

@@ -58,6 +58,7 @@
* | Target Mode Management | 0xf09b | 0xf002 |
* | | | 0xf046-0xf049 |
* | Target Mode Task Management | 0x1000d | |
* | Target Mode SRR | 0x11038 | |
* ----------------------------------------------------------------------
*/

View File

@@ -4369,6 +4369,7 @@ enable_82xx_npiv:
ha->max_npiv_vports =
MIN_MULTI_ID_FABRIC - 1;
}
qlt_config_nvram_with_fw_version(vha);
qla2x00_get_resource_cnts(vha);
qla_init_iocb_limit(vha);

File diff suppressed because it is too large Load Diff

View File

@@ -184,6 +184,7 @@ struct nack_to_isp {
#define NOTIFY_ACK_SRR_REJECT_REASON_UNABLE_TO_PERFORM 0x9
#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_NO_EXPL 0
#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_INVALID_OX_ID_RX_ID 0x17
#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_UNABLE_TO_SUPPLY_DATA 0x2a
#define NOTIFY_ACK_SUCCESS 0x01
@@ -686,6 +687,8 @@ struct qla_tgt_func_tmpl {
int (*handle_tmr)(struct qla_tgt_mgmt_cmd *, u64, uint16_t,
uint32_t);
struct qla_tgt_cmd *(*get_cmd)(struct fc_port *);
int (*get_cmd_ref)(struct qla_tgt_cmd *cmd);
void (*put_cmd_ref)(struct qla_tgt_cmd *cmd);
void (*rel_cmd)(struct qla_tgt_cmd *);
void (*free_cmd)(struct qla_tgt_cmd *);
void (*free_mcmd)(struct qla_tgt_mgmt_cmd *);
@@ -823,7 +826,13 @@ struct qla_tgt {
int notify_ack_expected;
int abts_resp_expected;
int modify_lun_expected;
spinlock_t srr_lock;
struct list_head srr_list;
struct work_struct srr_work;
atomic_t tgt_global_resets_count;
struct list_head tgt_list_entry;
};
@@ -861,6 +870,7 @@ enum trace_flags {
TRC_DATA_IN = BIT_18,
TRC_ABORT = BIT_19,
TRC_DIF_ERR = BIT_20,
TRC_SRR_IMM = BIT_21,
};
struct qla_tgt_cmd {
@@ -881,6 +891,10 @@ struct qla_tgt_cmd {
unsigned int conf_compl_supported:1;
unsigned int sg_mapped:1;
/* Call qlt_free_sg() if set. */
unsigned int free_sg:1;
unsigned int write_data_transferred:1;
/* Set if the SCSI status was sent successfully. */
@@ -892,6 +906,9 @@ struct qla_tgt_cmd {
unsigned int cmd_in_wq:1;
unsigned int edif:1;
/* Set if a SRR was rejected. */
unsigned int srr_failed:1;
/* Set if the exchange has been terminated. */
unsigned int sent_term_exchg:1;
@@ -901,6 +918,7 @@ struct qla_tgt_cmd {
*/
unsigned int aborted:1;
struct qla_tgt_srr *srr;
struct scatterlist *sg; /* cmd data buffer SG vector */
int sg_cnt; /* SG segments count */
int bufflen; /* cmd buffer length */
@@ -940,6 +958,14 @@ struct qla_tgt_cmd {
uint16_t prot_flags;
unsigned long jiffies_at_term_exchg;
/*
* jiffies64 when qlt_rdy_to_xfer() or qlt_xmit_response() first
* called, or 0 when not in those states. Used to limit the number of
* SRR retries.
*/
uint64_t jiffies_at_hw_st_entry;
uint64_t jiffies_at_alloc;
uint64_t jiffies_at_free;
@@ -1002,6 +1028,45 @@ struct qla_tgt_prm {
uint16_t tot_dsds;
};
/*
 * SRR (Sequence Retransmission Request) - resend or re-receive some or all
 * data or status to recover from a transient I/O error.
 *
 * One instance describes a single SRR.  It records which of the two hardware
 * messages (immediate notify and CTIO) have arrived so far and how the SRR
 * should be resolved (processed, rejected, or aborted).
 */
struct qla_tgt_srr {
/*
 * Copy of immediate notify SRR message received from hw; valid only if
 * imm_ntfy_recvd is true.
 */
struct imm_ntfy_from_isp imm_ntfy;
/*
 * List linkage.  NOTE(review): presumably links this SRR into
 * struct qla_tgt's srr_list under srr_lock - confirm in qla_target.c.
 */
struct list_head srr_list_entry;
/* The command affected by this SRR, or NULL if not yet determined. */
struct qla_tgt_cmd *cmd;
/*
 * Used to detect if the HBA has been reset since receiving the SRR.
 * NOTE(review): presumably a snapshot of a reset counter taken when the
 * SRR arrived - confirm where it is assigned and compared.
 */
uint32_t reset_count;
/*
 * The hardware sends two messages for each SRR - an immediate notify
 * and a CTIO with CTIO_SRR_RECEIVED status. These keep track of which
 * messages have been received. The SRR can be processed once both of
 * these are true.
 */
bool imm_ntfy_recvd;
bool ctio_recvd;
/*
 * This is set to true if the affected command was aborted (cmd may be
 * set to NULL), in which case the immediate notify exchange also needs
 * to be aborted.
 */
bool aborted;
/* This is set to true to force the SRR to be rejected. */
bool reject;
};
/*
 * Check for Switch reserved address: true when the domain byte is 0xff and
 * the upper nibble of the area byte is 0xf.
 *
 * The argument is parenthesized so that any expression yielding the ID
 * (e.g. a dereferenced pointer) expands correctly.  It is still evaluated
 * twice, so it must not have side effects.
 */
#define IS_SW_RESV_ADDR(_s_id) \
	(((_s_id).b.domain == 0xff) && (((_s_id).b.area & 0xf0) == 0xf0))
@@ -1056,6 +1121,20 @@ static inline uint32_t sid_to_key(const be_id_t s_id)
s_id.al_pa;
}
/*
 * Release the scatterlist that qlt_set_data_offset() allocated for this
 * command.  Only call this when cmd->free_sg is set.
 */
static inline void qlt_free_sg(struct qla_tgt_cmd *cmd)
{
	/*
	 * Even when this list is chained onto the original scatterlist, only
	 * its first segment was allocated by qlt_set_data_offset(), so that
	 * is the only part that has to be freed here.
	 */
	struct scatterlist *allocated_sg = cmd->sg;

	kfree(allocated_sg);
}
/*
* Exported symbols from qla_target.c LLD logic used by qla2xxx code.
*/
@@ -1064,6 +1143,7 @@ extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, struct rsp_que *,
extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *);
extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t);
extern int qlt_abort_cmd(struct qla_tgt_cmd *);
void qlt_srr_abort(struct qla_tgt_cmd *cmd, bool reject);
void qlt_send_term_exchange(struct qla_qpair *qpair,
struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked);
extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *);
@@ -1086,6 +1166,7 @@ extern void qlt_81xx_config_nvram_stage2(struct scsi_qla_host *,
struct init_cb_81xx *);
extern void qlt_81xx_config_nvram_stage1(struct scsi_qla_host *,
struct nvram_81xx *);
void qlt_config_nvram_with_fw_version(struct scsi_qla_host *vha);
extern void qlt_modify_vp_config(struct scsi_qla_host *,
struct vp_config_entry_24xx *);
extern void qlt_probe_one_stage1(struct scsi_qla_host *, struct qla_hw_data *);

View File

@@ -291,6 +291,16 @@ static struct qla_tgt_cmd *tcm_qla2xxx_get_cmd(struct fc_port *sess)
return cmd;
}
/*
 * get_cmd_ref callback: call target_get_sess_cmd() on the se_cmd embedded in
 * @cmd (with its second argument set to true) and pass its return value back
 * to the caller unchanged.
 */
static int tcm_qla2xxx_get_cmd_ref(struct qla_tgt_cmd *cmd)
{
	struct se_cmd *se_cmd = &cmd->se_cmd;
	int rc;

	rc = target_get_sess_cmd(se_cmd, true);
	return rc;
}
/*
 * put_cmd_ref callback: call target_put_sess_cmd() on the se_cmd embedded in
 * @cmd.  Counterpart of tcm_qla2xxx_get_cmd_ref().
 */
static void tcm_qla2xxx_put_cmd_ref(struct qla_tgt_cmd *cmd)
{
	struct se_cmd *se_cmd = &cmd->se_cmd;

	target_put_sess_cmd(se_cmd);
}
static void tcm_qla2xxx_rel_cmd(struct qla_tgt_cmd *cmd)
{
target_free_tag(cmd->sess->se_sess, &cmd->se_cmd);
@@ -531,6 +541,9 @@ static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
if (cmd->se_cmd.pi_err)
transport_generic_request_failure(&cmd->se_cmd,
cmd->se_cmd.pi_err);
else if (cmd->srr_failed)
transport_generic_request_failure(&cmd->se_cmd,
TCM_SNACK_REJECTED);
else
transport_generic_request_failure(&cmd->se_cmd,
TCM_CHECK_CONDITION_ABORT_CMD);
@@ -1526,6 +1539,8 @@ static const struct qla_tgt_func_tmpl tcm_qla2xxx_template = {
.handle_data = tcm_qla2xxx_handle_data,
.handle_tmr = tcm_qla2xxx_handle_tmr,
.get_cmd = tcm_qla2xxx_get_cmd,
.get_cmd_ref = tcm_qla2xxx_get_cmd_ref,
.put_cmd_ref = tcm_qla2xxx_put_cmd_ref,
.rel_cmd = tcm_qla2xxx_rel_cmd,
.free_cmd = tcm_qla2xxx_free_cmd,
.free_mcmd = tcm_qla2xxx_free_mcmd,