Skip to content

Commit f6bc11e

Browse files
yishaih authored and davem330 committed
net/mlx4_core: Enhance the catas flow to support device reset
This includes:
- Resetting the chip when a fatal error is detected (the current code does not do this).
- Exposing the ability to enter the error state from outside the catas code by calling its functionality (e.g. on a FW command timeout or an AER error).
- Managing a persistent device state. This is needed to synchronize between the different reset flow cases.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent ad9a0bf commit f6bc11e

File tree

4 files changed

+108
-29
lines changed

4 files changed

+108
-29
lines changed

drivers/net/ethernet/mellanox/mlx4/catas.c

Lines changed: 94 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset,
4848
"Reset device on internal errors if non-zero"
4949
" (default 1, in SRIOV mode default is 0)");
5050

51+
static int read_vendor_id(struct mlx4_dev *dev)
52+
{
53+
u16 vendor_id = 0;
54+
int ret;
55+
56+
ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
57+
if (ret) {
58+
mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
59+
return ret;
60+
}
61+
62+
if (vendor_id == 0xffff) {
63+
mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
64+
return -EINVAL;
65+
}
66+
67+
return 0;
68+
}
69+
70+
static int mlx4_reset_master(struct mlx4_dev *dev)
71+
{
72+
int err = 0;
73+
74+
if (!pci_channel_offline(dev->persist->pdev)) {
75+
err = read_vendor_id(dev);
76+
/* If PCI can't be accessed to read vendor ID we assume that its
77+
* link was disabled and chip was already reset.
78+
*/
79+
if (err)
80+
return 0;
81+
82+
err = mlx4_reset(dev);
83+
if (err)
84+
mlx4_err(dev, "Fail to reset HCA\n");
85+
}
86+
87+
return err;
88+
}
89+
90+
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
91+
{
92+
int err;
93+
struct mlx4_dev *dev;
94+
95+
if (!internal_err_reset)
96+
return;
97+
98+
mutex_lock(&persist->device_state_mutex);
99+
if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
100+
goto out;
101+
102+
dev = persist->dev;
103+
mlx4_err(dev, "device is going to be reset\n");
104+
err = mlx4_reset_master(dev);
105+
BUG_ON(err != 0);
106+
107+
dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
108+
mlx4_err(dev, "device was reset successfully\n");
109+
mutex_unlock(&persist->device_state_mutex);
110+
111+
/* At that step HW was already reset, now notify clients */
112+
mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
113+
return;
114+
115+
out:
116+
mutex_unlock(&persist->device_state_mutex);
117+
}
118+
119+
static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
120+
{
121+
int err = 0;
122+
123+
mlx4_enter_error_state(persist);
124+
err = mlx4_restart_one(persist->pdev);
125+
mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err);
126+
}
127+
51128
static void dump_err_buf(struct mlx4_dev *dev)
52129
{
53130
struct mlx4_priv *priv = mlx4_priv(dev);
@@ -66,42 +143,31 @@ static void poll_catas(unsigned long dev_ptr)
66143
struct mlx4_priv *priv = mlx4_priv(dev);
67144

68145
if (readl(priv->catas_err.map)) {
69-
/* If the device is off-line, we cannot try to recover it */
70-
if (pci_channel_offline(dev->persist->pdev))
71-
mod_timer(&priv->catas_err.timer,
72-
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
73-
else {
74-
dump_err_buf(dev);
75-
mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
76-
77-
if (internal_err_reset)
78-
queue_work(dev->persist->catas_wq,
79-
&dev->persist->catas_work);
80-
}
81-
} else
82-
mod_timer(&priv->catas_err.timer,
83-
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
146+
dump_err_buf(dev);
147+
goto internal_err;
148+
}
149+
150+
if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
151+
mlx4_warn(dev, "Internal error mark was detected on device\n");
152+
goto internal_err;
153+
}
154+
155+
mod_timer(&priv->catas_err.timer,
156+
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
157+
return;
158+
159+
internal_err:
160+
if (internal_err_reset)
161+
queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
84162
}
85163

86164
static void catas_reset(struct work_struct *work)
87165
{
88166
struct mlx4_dev_persistent *persist =
89167
container_of(work, struct mlx4_dev_persistent,
90168
catas_work);
91-
struct pci_dev *pdev = persist->pdev;
92-
int ret;
93-
94-
/* If the device is off-line, we cannot reset it */
95-
if (pci_channel_offline(pdev))
96-
return;
97169

98-
ret = mlx4_restart_one(pdev);
99-
/* 'priv' now is not valid */
100-
if (ret)
101-
pr_err("mlx4 %s: Reset failed (%d)\n",
102-
pci_name(pdev), ret);
103-
else
104-
mlx4_dbg(persist->dev, "Reset succeeded\n");
170+
mlx4_handle_error_state(persist);
105171
}
106172

107173
void mlx4_start_catas_poll(struct mlx4_dev *dev)

drivers/net/ethernet/mellanox/mlx4/main.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2624,6 +2624,11 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
26242624
}
26252625
}
26262626

2627+
/* on load remove any previous indication of internal error,
2628+
* device is up.
2629+
*/
2630+
dev->persist->state = MLX4_DEVICE_STATE_UP;
2631+
26272632
slave_start:
26282633
err = mlx4_cmd_init(dev);
26292634
if (err) {
@@ -3108,6 +3113,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
31083113
dev->persist->dev = dev;
31093114
pci_set_drvdata(pdev, dev->persist);
31103115
priv->pci_dev_data = id->driver_data;
3116+
mutex_init(&dev->persist->device_state_mutex);
31113117

31123118
ret = __mlx4_init_one(pdev, id->driver_data, priv);
31133119
if (ret) {

drivers/net/ethernet/mellanox/mlx4/mlx4.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1178,7 +1178,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
11781178

11791179
void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
11801180

1181-
void mlx4_handle_catas_err(struct mlx4_dev *dev);
1181+
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist);
11821182

11831183
int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
11841184
enum mlx4_port_type *type);

include/linux/mlx4/device.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,11 @@ enum {
411411
MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4,
412412
};
413413

414+
/* Bit flags kept in the 'state' field of struct mlx4_dev_persistent. */
enum {
	/* Assigned by mlx4_load_one() when the device is loaded. */
	MLX4_DEVICE_STATE_UP			= 1 << 0,
	/* OR-ed in by mlx4_enter_error_state() after it resets the chip. */
	MLX4_DEVICE_STATE_INTERNAL_ERROR	= 1 << 1,
};
418+
414419
#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
415420
MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
416421

@@ -753,6 +758,8 @@ struct mlx4_dev_persistent {
753758
enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1];
754759
struct work_struct catas_work;
755760
struct workqueue_struct *catas_wq;
761+
struct mutex device_state_mutex; /* protect HW state */
762+
u8 state;
756763
};
757764

758765
struct mlx4_dev {

0 commit comments

Comments
 (0)