@@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset,
48
48
"Reset device on internal errors if non-zero"
49
49
" (default 1, in SRIOV mode default is 0)" );
50
50
51
+ static int read_vendor_id (struct mlx4_dev * dev )
52
+ {
53
+ u16 vendor_id = 0 ;
54
+ int ret ;
55
+
56
+ ret = pci_read_config_word (dev -> persist -> pdev , 0 , & vendor_id );
57
+ if (ret ) {
58
+ mlx4_err (dev , "Failed to read vendor ID, ret=%d\n" , ret );
59
+ return ret ;
60
+ }
61
+
62
+ if (vendor_id == 0xffff ) {
63
+ mlx4_err (dev , "PCI can't be accessed to read vendor id\n" );
64
+ return - EINVAL ;
65
+ }
66
+
67
+ return 0 ;
68
+ }
69
+
70
+ static int mlx4_reset_master (struct mlx4_dev * dev )
71
+ {
72
+ int err = 0 ;
73
+
74
+ if (!pci_channel_offline (dev -> persist -> pdev )) {
75
+ err = read_vendor_id (dev );
76
+ /* If PCI can't be accessed to read vendor ID we assume that its
77
+ * link was disabled and chip was already reset.
78
+ */
79
+ if (err )
80
+ return 0 ;
81
+
82
+ err = mlx4_reset (dev );
83
+ if (err )
84
+ mlx4_err (dev , "Fail to reset HCA\n" );
85
+ }
86
+
87
+ return err ;
88
+ }
89
+
90
+ void mlx4_enter_error_state (struct mlx4_dev_persistent * persist )
91
+ {
92
+ int err ;
93
+ struct mlx4_dev * dev ;
94
+
95
+ if (!internal_err_reset )
96
+ return ;
97
+
98
+ mutex_lock (& persist -> device_state_mutex );
99
+ if (persist -> state & MLX4_DEVICE_STATE_INTERNAL_ERROR )
100
+ goto out ;
101
+
102
+ dev = persist -> dev ;
103
+ mlx4_err (dev , "device is going to be reset\n" );
104
+ err = mlx4_reset_master (dev );
105
+ BUG_ON (err != 0 );
106
+
107
+ dev -> persist -> state |= MLX4_DEVICE_STATE_INTERNAL_ERROR ;
108
+ mlx4_err (dev , "device was reset successfully\n" );
109
+ mutex_unlock (& persist -> device_state_mutex );
110
+
111
+ /* At that step HW was already reset, now notify clients */
112
+ mlx4_dispatch_event (dev , MLX4_DEV_EVENT_CATASTROPHIC_ERROR , 0 );
113
+ return ;
114
+
115
+ out :
116
+ mutex_unlock (& persist -> device_state_mutex );
117
+ }
118
+
119
+ static void mlx4_handle_error_state (struct mlx4_dev_persistent * persist )
120
+ {
121
+ int err = 0 ;
122
+
123
+ mlx4_enter_error_state (persist );
124
+ err = mlx4_restart_one (persist -> pdev );
125
+ mlx4_info (persist -> dev , "mlx4_restart_one was ended, ret=%d\n" , err );
126
+ }
127
+
51
128
static void dump_err_buf (struct mlx4_dev * dev )
52
129
{
53
130
struct mlx4_priv * priv = mlx4_priv (dev );
@@ -66,42 +143,31 @@ static void poll_catas(unsigned long dev_ptr)
66
143
struct mlx4_priv * priv = mlx4_priv (dev );
67
144
68
145
if (readl (priv -> catas_err .map )) {
69
- /* If the device is off-line, we cannot try to recover it */
70
- if (pci_channel_offline (dev -> persist -> pdev ))
71
- mod_timer (& priv -> catas_err .timer ,
72
- round_jiffies (jiffies + MLX4_CATAS_POLL_INTERVAL ));
73
- else {
74
- dump_err_buf (dev );
75
- mlx4_dispatch_event (dev , MLX4_DEV_EVENT_CATASTROPHIC_ERROR , 0 );
76
-
77
- if (internal_err_reset )
78
- queue_work (dev -> persist -> catas_wq ,
79
- & dev -> persist -> catas_work );
80
- }
81
- } else
82
- mod_timer (& priv -> catas_err .timer ,
83
- round_jiffies (jiffies + MLX4_CATAS_POLL_INTERVAL ));
146
+ dump_err_buf (dev );
147
+ goto internal_err ;
148
+ }
149
+
150
+ if (dev -> persist -> state & MLX4_DEVICE_STATE_INTERNAL_ERROR ) {
151
+ mlx4_warn (dev , "Internal error mark was detected on device\n" );
152
+ goto internal_err ;
153
+ }
154
+
155
+ mod_timer (& priv -> catas_err .timer ,
156
+ round_jiffies (jiffies + MLX4_CATAS_POLL_INTERVAL ));
157
+ return ;
158
+
159
+ internal_err :
160
+ if (internal_err_reset )
161
+ queue_work (dev -> persist -> catas_wq , & dev -> persist -> catas_work );
84
162
}
85
163
86
164
static void catas_reset (struct work_struct * work )
87
165
{
88
166
struct mlx4_dev_persistent * persist =
89
167
container_of (work , struct mlx4_dev_persistent ,
90
168
catas_work );
91
- struct pci_dev * pdev = persist -> pdev ;
92
- int ret ;
93
-
94
- /* If the device is off-line, we cannot reset it */
95
- if (pci_channel_offline (pdev ))
96
- return ;
97
169
98
- ret = mlx4_restart_one (pdev );
99
- /* 'priv' now is not valid */
100
- if (ret )
101
- pr_err ("mlx4 %s: Reset failed (%d)\n" ,
102
- pci_name (pdev ), ret );
103
- else
104
- mlx4_dbg (persist -> dev , "Reset succeeded\n" );
170
+ mlx4_handle_error_state (persist );
105
171
}
106
172
107
173
void mlx4_start_catas_poll (struct mlx4_dev * dev )
0 commit comments