Skip to content

Commit 32517fc

Browse files
jsmart-ghmartinkpetersen
authored andcommitted
scsi: lpfc: Rework EQ/CQ processing to address interrupt coalescing
When driving high iop counts, auto_imax coalescing kicks in and drives the performance to extremely small iops levels. There are two issues: 1) auto_imax is enabled by default. The auto algorithm, when iops gets high, divides the iops by the hdwq count and uses that value to calculate EQ_Delay. The EQ_Delay is set uniformly on all EQs whether they have load or not. The EQ_delay is only manipulated every 5s (a long time). Thus there were large 5s swings of no interrupt delay followed by large/maximum delay, before repeating. 2) When processing a CQ, the driver got mixed up on the rate of when to ring the doorbell to keep the chip appraised of the eqe or cqe consumption as well as how how long to sit in the thread and process queue entries. Currently, the driver capped its work at 64 entries (very small) and exited/rearmed the CQ. Thus, on heavy loads, additional overheads were taken to exit and re-enter the interrupt handler. Worse, if in the large/maximum coalescing windows,k it could be a while before getting back to servicing. The issues are corrected by the following: - A change in defaults. Auto_imax is turned OFF and fcp_imax is set to 0. Thus all interrupts are immediate. - Cleanup of field names and their meanings. Existing names were non-intuitive or used for duplicate things. - Added max_proc_limit field, to control the length of time the handlers would service completions. - Reworked EQ handling: Added common routine that walks eq, applying notify interval and max processing limits. Use queue_claimed to claim ownership of the queue while processing. Always rearm the queue whenever the common routine is called. Rework queue element processing, namely to eliminate hba_index vs host_index. Only one index is necessary. The queue entry can be marked invalid and the host_index updated immediately after eqe processing. After rework, xx_release routines are now DB write functions. Renamed the routines as such. Moved lpfc_sli4_eq_flush(), which does similar action, to same area. Replaced the 2 individual loops that walk an eq with a call to the common routine. Slightly revised lpfc_sli4_hba_handle_eqe() calling syntax. Added per-cpu counters to detect interrupt rates and scale interrupt coalescing values. - Reworked CQ handling: Added common routine that walks cq, applying notify interval and max processing limits. Use queue_claimed to claim ownership of the queue while processing. Always rearm the queue whenever the common routine is called. Rework queue element processing, namely to eliminate hba_index vs host_index. Only one index is necessary. The queue entry can be marked invalid and the host_index updated immediately after cqe processing. After rework, xx_release routines are now DB write functions. Renamed the routines as such. Replaced the 3 individual loops that walk a cq with a call to the common routine. Redefined lpfc_sli4_sp_handle_mcqe() to commong handler definition with queue reference. Add increment for mbox completion to handler. - Added a new module/sysfs attribute: lpfc_cq_max_proc_limit To allow dynamic changing of the CQ max_proc_limit value being used. Although this leaves an EQ as an immediate interrupt, that interrupt will only occur if a CQ bound to it is in an armed state and has cqe's to process. By staying in the cq processing routine longer, high loads will avoid generating more interrupts as they will only rearm as the processing thread exits. The immediately interrupt is also beneficial to idle or lower-processing CQ's as they get serviced immediately without being penalized by sharing an EQ with a more loaded CQ. Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Reviewed-by: Hannes Reinecke <hare@suse.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
1 parent cb733e3 commit 32517fc

File tree

7 files changed

+729
-476
lines changed

7 files changed

+729
-476
lines changed

drivers/scsi/lpfc/lpfc.h

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,7 @@ struct lpfc_hba {
686686
struct lpfc_sli4_hba sli4_hba;
687687

688688
struct workqueue_struct *wq;
689+
struct delayed_work eq_delay_work;
689690

690691
struct lpfc_sli sli;
691692
uint8_t pci_dev_grp; /* lpfc PCI dev group: 0x0, 0x1, 0x2,... */
@@ -789,7 +790,6 @@ struct lpfc_hba {
789790
uint8_t nvmet_support; /* driver supports NVMET */
790791
#define LPFC_NVMET_MAX_PORTS 32
791792
uint8_t mds_diags_support;
792-
uint32_t initial_imax;
793793
uint8_t bbcredit_support;
794794
uint8_t enab_exp_wqcq_pages;
795795

@@ -817,6 +817,8 @@ struct lpfc_hba {
817817
uint32_t cfg_use_msi;
818818
uint32_t cfg_auto_imax;
819819
uint32_t cfg_fcp_imax;
820+
uint32_t cfg_cq_poll_threshold;
821+
uint32_t cfg_cq_max_proc_limit;
820822
uint32_t cfg_fcp_cpu_map;
821823
uint32_t cfg_hdw_queue;
822824
uint32_t cfg_irq_chann;
@@ -1084,7 +1086,6 @@ struct lpfc_hba {
10841086

10851087
uint8_t temp_sensor_support;
10861088
/* Fields used for heart beat. */
1087-
unsigned long last_eqdelay_time;
10881089
unsigned long last_completion_time;
10891090
unsigned long skipped_hb;
10901091
struct timer_list hb_tmofunc;
@@ -1287,3 +1288,23 @@ lpfc_phba_elsring(struct lpfc_hba *phba)
12871288
}
12881289
return &phba->sli.sli3_ring[LPFC_ELS_RING];
12891290
}
1291+
1292+
/**
1293+
* lpfc_sli4_mod_hba_eq_delay - update EQ delay
1294+
* @phba: Pointer to HBA context object.
1295+
* @q: The Event Queue to update.
1296+
* @delay: The delay value (in us) to be written.
1297+
*
1298+
**/
1299+
static inline void
1300+
lpfc_sli4_mod_hba_eq_delay(struct lpfc_hba *phba, struct lpfc_queue *eq,
1301+
u32 delay)
1302+
{
1303+
struct lpfc_register reg_data;
1304+
1305+
reg_data.word0 = 0;
1306+
bf_set(lpfc_sliport_eqdelay_id, &reg_data, eq->queue_id);
1307+
bf_set(lpfc_sliport_eqdelay_delay, &reg_data, delay);
1308+
writel(reg_data.word0, phba->sli4_hba.u.if_type2.EQDregaddr);
1309+
eq->q_mode = delay;
1310+
}

drivers/scsi/lpfc/lpfc_attr.c

Lines changed: 128 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4935,6 +4935,7 @@ lpfc_fcp_imax_store(struct device *dev, struct device_attribute *attr,
49354935
struct Scsi_Host *shost = class_to_shost(dev);
49364936
struct lpfc_vport *vport = (struct lpfc_vport *)shost->hostdata;
49374937
struct lpfc_hba *phba = vport->phba;
4938+
struct lpfc_eq_intr_info *eqi;
49384939
uint32_t usdelay;
49394940
int val = 0, i;
49404941

@@ -4956,8 +4957,18 @@ lpfc_fcp_imax_store(struct device *dev, struct device_attribute *attr,
49564957
if (val && (val < LPFC_MIN_IMAX || val > LPFC_MAX_IMAX))
49574958
return -EINVAL;
49584959

4960+
phba->cfg_auto_imax = (val) ? 0 : 1;
4961+
if (phba->cfg_fcp_imax && !val) {
4962+
queue_delayed_work(phba->wq, &phba->eq_delay_work,
4963+
msecs_to_jiffies(LPFC_EQ_DELAY_MSECS));
4964+
4965+
for_each_present_cpu(i) {
4966+
eqi = per_cpu_ptr(phba->sli4_hba.eq_info, i);
4967+
eqi->icnt = 0;
4968+
}
4969+
}
4970+
49594971
phba->cfg_fcp_imax = (uint32_t)val;
4960-
phba->initial_imax = phba->cfg_fcp_imax;
49614972

49624973
if (phba->cfg_fcp_imax)
49634974
usdelay = LPFC_SEC_TO_USEC / phba->cfg_fcp_imax;
@@ -5020,15 +5031,119 @@ lpfc_fcp_imax_init(struct lpfc_hba *phba, int val)
50205031

50215032
static DEVICE_ATTR_RW(lpfc_fcp_imax);
50225033

5034+
/**
5035+
* lpfc_cq_max_proc_limit_store
5036+
*
5037+
* @dev: class device that is converted into a Scsi_host.
5038+
* @attr: device attribute, not used.
5039+
* @buf: string with the cq max processing limit of cqes
5040+
* @count: unused variable.
5041+
*
5042+
* Description:
5043+
* If val is in a valid range, then set value on each cq
5044+
*
5045+
* Returns:
5046+
* The length of the buf: if successful
5047+
* -ERANGE: if val is not in the valid range
5048+
* -EINVAL: if bad value format or intended mode is not supported.
5049+
**/
5050+
static ssize_t
5051+
lpfc_cq_max_proc_limit_store(struct device *dev, struct device_attribute *attr,
5052+
const char *buf, size_t count)
5053+
{
5054+
struct Scsi_Host *shost = class_to_shost(dev);
5055+
struct lpfc_vport *vport = (struct lpfc_vport *)shost->hostdata;
5056+
struct lpfc_hba *phba = vport->phba;
5057+
struct lpfc_queue *eq, *cq;
5058+
unsigned long val;
5059+
int i;
5060+
5061+
/* cq_max_proc_limit is only valid for SLI4 */
5062+
if (phba->sli_rev != LPFC_SLI_REV4)
5063+
return -EINVAL;
5064+
5065+
/* Sanity check on user data */
5066+
if (!isdigit(buf[0]))
5067+
return -EINVAL;
5068+
if (kstrtoul(buf, 0, &val))
5069+
return -EINVAL;
5070+
5071+
if (val < LPFC_CQ_MIN_PROC_LIMIT || val > LPFC_CQ_MAX_PROC_LIMIT)
5072+
return -ERANGE;
5073+
5074+
phba->cfg_cq_max_proc_limit = (uint32_t)val;
5075+
5076+
/* set the values on the cq's */
5077+
for (i = 0; i < phba->cfg_irq_chann; i++) {
5078+
eq = phba->sli4_hba.hdwq[i].hba_eq;
5079+
if (!eq)
5080+
continue;
5081+
5082+
list_for_each_entry(cq, &eq->child_list, list)
5083+
cq->max_proc_limit = min(phba->cfg_cq_max_proc_limit,
5084+
cq->entry_count);
5085+
}
5086+
5087+
return strlen(buf);
5088+
}
5089+
50235090
/*
5024-
* lpfc_auto_imax: Controls Auto-interrupt coalescing values support.
5025-
* 0 No auto_imax support
5026-
* 1 auto imax on
5027-
* Auto imax will change the value of fcp_imax on a per EQ basis, using
5028-
* the EQ Delay Multiplier, depending on the activity for that EQ.
5029-
* Value range [0,1]. Default value is 1.
5091+
* lpfc_cq_max_proc_limit: The maximum number CQE entries processed in an
5092+
* itteration of CQ processing.
50305093
*/
5031-
LPFC_ATTR_RW(auto_imax, 1, 0, 1, "Enable Auto imax");
5094+
static int lpfc_cq_max_proc_limit = LPFC_CQ_DEF_MAX_PROC_LIMIT;
5095+
module_param(lpfc_cq_max_proc_limit, int, 0644);
5096+
MODULE_PARM_DESC(lpfc_cq_max_proc_limit,
5097+
"Set the maximum number CQEs processed in an iteration of "
5098+
"CQ processing");
5099+
lpfc_param_show(cq_max_proc_limit)
5100+
5101+
/*
5102+
* lpfc_cq_poll_threshold: Set the threshold of CQE completions in a
5103+
* single handler call which should request a polled completion rather
5104+
* than re-enabling interrupts.
5105+
*/
5106+
LPFC_ATTR_RW(cq_poll_threshold, LPFC_CQ_DEF_THRESHOLD_TO_POLL,
5107+
LPFC_CQ_MIN_THRESHOLD_TO_POLL,
5108+
LPFC_CQ_MAX_THRESHOLD_TO_POLL,
5109+
"CQE Processing Threshold to enable Polling");
5110+
5111+
/**
5112+
* lpfc_cq_max_proc_limit_init - Set the initial cq max_proc_limit
5113+
* @phba: lpfc_hba pointer.
5114+
* @val: entry limit
5115+
*
5116+
* Description:
5117+
* If val is in a valid range, then initialize the adapter's maximum
5118+
* value.
5119+
*
5120+
* Returns:
5121+
* Always returns 0 for success, even if value not always set to
5122+
* requested value. If value out of range or not supported, will fall
5123+
* back to default.
5124+
**/
5125+
static int
5126+
lpfc_cq_max_proc_limit_init(struct lpfc_hba *phba, int val)
5127+
{
5128+
phba->cfg_cq_max_proc_limit = LPFC_CQ_DEF_MAX_PROC_LIMIT;
5129+
5130+
if (phba->sli_rev != LPFC_SLI_REV4)
5131+
return 0;
5132+
5133+
if (val >= LPFC_CQ_MIN_PROC_LIMIT && val <= LPFC_CQ_MAX_PROC_LIMIT) {
5134+
phba->cfg_cq_max_proc_limit = val;
5135+
return 0;
5136+
}
5137+
5138+
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
5139+
"0371 "LPFC_DRIVER_NAME"_cq_max_proc_limit: "
5140+
"%d out of range, using default\n",
5141+
phba->cfg_cq_max_proc_limit);
5142+
5143+
return 0;
5144+
}
5145+
5146+
static DEVICE_ATTR_RW(lpfc_cq_max_proc_limit);
50325147

50335148
/**
50345149
* lpfc_state_show - Display current driver CPU affinity
@@ -5788,8 +5903,9 @@ struct device_attribute *lpfc_hba_attrs[] = {
57885903
&dev_attr_lpfc_use_msi,
57895904
&dev_attr_lpfc_nvme_oas,
57905905
&dev_attr_lpfc_nvme_embed_cmd,
5791-
&dev_attr_lpfc_auto_imax,
57925906
&dev_attr_lpfc_fcp_imax,
5907+
&dev_attr_lpfc_cq_poll_threshold,
5908+
&dev_attr_lpfc_cq_max_proc_limit,
57935909
&dev_attr_lpfc_fcp_cpu_map,
57945910
&dev_attr_lpfc_hdw_queue,
57955911
&dev_attr_lpfc_irq_chann,
@@ -6834,8 +6950,9 @@ lpfc_get_cfgparam(struct lpfc_hba *phba)
68346950
lpfc_use_msi_init(phba, lpfc_use_msi);
68356951
lpfc_nvme_oas_init(phba, lpfc_nvme_oas);
68366952
lpfc_nvme_embed_cmd_init(phba, lpfc_nvme_embed_cmd);
6837-
lpfc_auto_imax_init(phba, lpfc_auto_imax);
68386953
lpfc_fcp_imax_init(phba, lpfc_fcp_imax);
6954+
lpfc_cq_poll_threshold_init(phba, lpfc_cq_poll_threshold);
6955+
lpfc_cq_max_proc_limit_init(phba, lpfc_cq_max_proc_limit);
68396956
lpfc_fcp_cpu_map_init(phba, lpfc_fcp_cpu_map);
68406957
lpfc_enable_hba_reset_init(phba, lpfc_enable_hba_reset);
68416958
lpfc_enable_hba_heartbeat_init(phba, lpfc_enable_hba_heartbeat);
@@ -6888,9 +7005,7 @@ lpfc_get_cfgparam(struct lpfc_hba *phba)
68887005
phba->cfg_enable_fc4_type |= LPFC_ENABLE_FCP;
68897006
}
68907007

6891-
if (phba->cfg_auto_imax && !phba->cfg_fcp_imax)
6892-
phba->cfg_auto_imax = 0;
6893-
phba->initial_imax = phba->cfg_fcp_imax;
7008+
phba->cfg_auto_imax = (phba->cfg_fcp_imax) ? 0 : 1;
68947009

68957010
phba->cfg_enable_pbde = 0;
68967011

drivers/scsi/lpfc/lpfc_debugfs.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3766,10 +3766,10 @@ __lpfc_idiag_print_wq(struct lpfc_queue *qp, char *wqtype,
37663766
(unsigned long long)qp->q_cnt_4);
37673767
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
37683768
"\t\tWQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
3769-
"HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]",
3769+
"HST-IDX[%04d], PRT-IDX[%04d], NTFI[%03d]",
37703770
qp->queue_id, qp->entry_count,
37713771
qp->entry_size, qp->host_index,
3772-
qp->hba_index, qp->entry_repost);
3772+
qp->hba_index, qp->notify_interval);
37733773
len += snprintf(pbuffer + len,
37743774
LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
37753775
return len;
@@ -3819,10 +3819,10 @@ __lpfc_idiag_print_cq(struct lpfc_queue *qp, char *cqtype,
38193819
qp->q_cnt_3, (unsigned long long)qp->q_cnt_4);
38203820
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
38213821
"\tCQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
3822-
"HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]",
3822+
"HST-IDX[%04d], NTFI[%03d], PLMT[%03d]",
38233823
qp->queue_id, qp->entry_count,
38243824
qp->entry_size, qp->host_index,
3825-
qp->hba_index, qp->entry_repost);
3825+
qp->notify_interval, qp->max_proc_limit);
38263826

38273827
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
38283828

@@ -3845,15 +3845,15 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp,
38453845
qp->q_cnt_3, (unsigned long long)qp->q_cnt_4);
38463846
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
38473847
"\t\tHQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
3848-
"HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]\n",
3848+
"HST-IDX[%04d], PRT-IDX[%04d], NTFI[%03d]\n",
38493849
qp->queue_id, qp->entry_count, qp->entry_size,
3850-
qp->host_index, qp->hba_index, qp->entry_repost);
3850+
qp->host_index, qp->hba_index, qp->notify_interval);
38513851
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
38523852
"\t\tDQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
3853-
"HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]\n",
3853+
"HST-IDX[%04d], PRT-IDX[%04d], NTFI[%03d]\n",
38543854
datqp->queue_id, datqp->entry_count,
38553855
datqp->entry_size, datqp->host_index,
3856-
datqp->hba_index, datqp->entry_repost);
3856+
datqp->hba_index, datqp->notify_interval);
38573857
return len;
38583858
}
38593859

@@ -3934,10 +3934,10 @@ __lpfc_idiag_print_eq(struct lpfc_queue *qp, char *eqtype,
39343934
(unsigned long long)qp->q_cnt_4, qp->q_mode);
39353935
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
39363936
"EQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
3937-
"HST-IDX[%04d], PRT-IDX[%04d], PST[%03d] AFFIN[%03d]",
3937+
"HST-IDX[%04d], NTFI[%03d], PLMT[%03d], AFFIN[%03d]",
39383938
qp->queue_id, qp->entry_count, qp->entry_size,
3939-
qp->host_index, qp->hba_index, qp->entry_repost,
3940-
qp->chann);
3939+
qp->host_index, qp->notify_interval,
3940+
qp->max_proc_limit, qp->chann);
39413941
len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
39423942

39433943
return len;

drivers/scsi/lpfc/lpfc_hw4.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,14 @@ struct lpfc_sli_intf {
208208
/* Configuration of Interrupts / sec for entire HBA port */
209209
#define LPFC_MIN_IMAX 5000
210210
#define LPFC_MAX_IMAX 5000000
211-
#define LPFC_DEF_IMAX 150000
211+
#define LPFC_DEF_IMAX 0
212+
213+
#define LPFC_IMAX_THRESHOLD 1000
214+
#define LPFC_MAX_AUTO_EQ_DELAY 120
215+
#define LPFC_EQ_DELAY_STEP 15
216+
#define LPFC_EQD_ISR_TRIGGER 20000
217+
/* 1s intervals */
218+
#define LPFC_EQ_DELAY_MSECS 1000
212219

213220
#define LPFC_MIN_CPU_MAP 0
214221
#define LPFC_MAX_CPU_MAP 1

0 commit comments

Comments
 (0)