Skip to content

Commit b03e749

Browse files
Jon Mason authored and jbarnes993 committed
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sent by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved in two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128-byte MPS USB controller on one leg of a switch will dramatically reduce the performance of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case. Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. 
Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
1 parent 5f66d2b commit b03e749

File tree

5 files changed

+236
-45
lines changed

5 files changed

+236
-45
lines changed

arch/x86/pci/acpi.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
360360
}
361361
}
362362

363+
/* After the PCI-E bus has been walked and all devices discovered,
364+
* configure any settings of the fabric that might be necessary.
365+
*/
366+
if (bus) {
367+
struct pci_bus *child;
368+
list_for_each_entry(child, &bus->children, node)
369+
pcie_bus_configure_settings(child, child->self->pcie_mpss);
370+
}
371+
363372
if (!bus)
364373
kfree(sd);
365374

drivers/pci/hotplug/pcihp_slot.c

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
158158
*/
159159
}
160160

161-
/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
162-
static int pci_set_payload(struct pci_dev *dev)
163-
{
164-
int pos, ppos;
165-
u16 pctl, psz;
166-
u16 dctl, dsz, dcap, dmax;
167-
struct pci_dev *parent;
168-
169-
parent = dev->bus->self;
170-
pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
171-
if (!pos)
172-
return 0;
173-
174-
/* Read Device MaxPayload capability and setting */
175-
pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
176-
pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
177-
dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
178-
dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
179-
180-
/* Read Parent MaxPayload setting */
181-
ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
182-
if (!ppos)
183-
return 0;
184-
pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
185-
psz = (pctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
186-
187-
/* If parent payload > device max payload -> error
188-
* If parent payload > device payload -> set speed
189-
* If parent payload <= device payload -> do nothing
190-
*/
191-
if (psz > dmax)
192-
return -1;
193-
else if (psz > dsz) {
194-
dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
195-
pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
196-
(dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
197-
(psz << 5));
198-
}
199-
return 0;
200-
}
201-
202161
void pci_configure_slot(struct pci_dev *dev)
203162
{
204163
struct pci_dev *cdev;
@@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
210169
(dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
211170
return;
212171

213-
ret = pci_set_payload(dev);
214-
if (ret)
215-
dev_warn(&dev->dev, "could not set device max payload\n");
172+
pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
216173

217174
memset(&hpp, 0, sizeof(hpp));
218175
ret = pci_get_hp_params(dev, &hpp);

drivers/pci/pci.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
7777
unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE;
7878
unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
7979

80+
enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
81+
8082
/*
8183
* The default CLS is used if arch didn't set CLS explicitly and not
8284
* all pci devices agree on the same value. Arch can override either
@@ -3222,6 +3224,67 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
32223224
}
32233225
EXPORT_SYMBOL(pcie_set_readrq);
32243226

3227+
/**
3228+
* pcie_get_mps - get PCI Express maximum payload size
3229+
* @dev: PCI device to query
3230+
*
3231+
* Returns maximum payload size in bytes
3232+
* or appropriate error value.
3233+
*/
3234+
int pcie_get_mps(struct pci_dev *dev)
3235+
{
3236+
int ret, cap;
3237+
u16 ctl;
3238+
3239+
cap = pci_pcie_cap(dev);
3240+
if (!cap)
3241+
return -EINVAL;
3242+
3243+
ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
3244+
if (!ret)
3245+
ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
3246+
3247+
return ret;
3248+
}
3249+
3250+
/**
3251+
* pcie_set_mps - set PCI Express maximum payload size
3252+
* @dev: PCI device to query
3253+
* @mps: maximum payload size in bytes
3254+
* valid values are 128, 256, 512, 1024, 2048, 4096
3255+
*
3256+
* If possible sets maximum payload size
3257+
*/
3258+
int pcie_set_mps(struct pci_dev *dev, int mps)
3259+
{
3260+
int cap, err = -EINVAL;
3261+
u16 ctl, v;
3262+
3263+
if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
3264+
goto out;
3265+
3266+
v = ffs(mps) - 8;
3267+
if (v > dev->pcie_mpss)
3268+
goto out;
3269+
v <<= 5;
3270+
3271+
cap = pci_pcie_cap(dev);
3272+
if (!cap)
3273+
goto out;
3274+
3275+
err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
3276+
if (err)
3277+
goto out;
3278+
3279+
if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
3280+
ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
3281+
ctl |= v;
3282+
err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
3283+
}
3284+
out:
3285+
return err;
3286+
}
3287+
32253288
/**
32263289
* pci_select_bars - Make BAR mask from the type of resource
32273290
* @dev: the PCI device for which BAR mask is made
@@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str)
35053568
pci_hotplug_io_size = memparse(str + 9, &str);
35063569
} else if (!strncmp(str, "hpmemsize=", 10)) {
35073570
pci_hotplug_mem_size = memparse(str + 10, &str);
3571+
} else if (!strncmp(str, "pcie_bus_safe", 13)) {
3572+
pcie_bus_config = PCIE_BUS_SAFE;
3573+
} else if (!strncmp(str, "pcie_bus_perf", 13)) {
3574+
pcie_bus_config = PCIE_BUS_PERFORMANCE;
35083575
} else {
35093576
printk(KERN_ERR "PCI: Unknown option `%s'\n",
35103577
str);

drivers/pci/probe.c

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
856856
pdev->pcie_cap = pos;
857857
pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
858858
pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
859+
pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
860+
pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
859861
}
860862

861863
void set_pcie_hotplug_bridge(struct pci_dev *pdev)
@@ -1326,6 +1328,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
13261328
return nr;
13271329
}
13281330

1331+
static int pcie_find_smpss(struct pci_dev *dev, void *data)
1332+
{
1333+
u8 *smpss = data;
1334+
1335+
if (!pci_is_pcie(dev))
1336+
return 0;
1337+
1338+
/* For PCIE hotplug enabled slots not connected directly to a
1339+
* PCI-E root port, there can be problems when hotplugging
1340+
* devices. This is due to the possibility of hotplugging a
1341+
* device into the fabric with a smaller MPS than the devices
1342+
* currently running have configured. Modifying the MPS on the
1343+
* running devices could cause a fatal bus error due to an
1344+
* incoming frame being larger than the newly configured MPS.
1345+
* To work around this, the MPS for the entire fabric must be
1346+
* set to the minimum size. Any devices hotplugged into this
1347+
* fabric will have the minimum MPS set. If the PCI hotplug
1348+
* slot is directly connected to the root port and there are not
1349+
* other devices on the fabric (which seems to be the most
1350+
* common case), then this is not an issue and MPS discovery
1351+
* will occur as normal.
1352+
*/
1353+
if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
1354+
dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
1355+
*smpss = 0;
1356+
1357+
if (*smpss > dev->pcie_mpss)
1358+
*smpss = dev->pcie_mpss;
1359+
1360+
return 0;
1361+
}
1362+
1363+
static void pcie_write_mps(struct pci_dev *dev, int mps)
1364+
{
1365+
int rc, dev_mpss;
1366+
1367+
dev_mpss = 128 << dev->pcie_mpss;
1368+
1369+
if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
1370+
if (dev->bus->self) {
1371+
dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
1372+
128 << dev->bus->self->pcie_mpss);
1373+
1374+
/* For "MPS Force Max", the assumption is made that
1375+
* downstream communication will never be larger than
1376+
* the MRRS. So, the MPS only needs to be configured
1377+
* for the upstream communication. This being the case,
1378+
* walk from the top down and set the MPS of the child
1379+
* to that of the parent bus.
1380+
*/
1381+
mps = 128 << dev->bus->self->pcie_mpss;
1382+
if (mps > dev_mpss)
1383+
dev_warn(&dev->dev, "MPS configured higher than"
1384+
" maximum supported by the device. If"
1385+
" a bus issue occurs, try running with"
1386+
" pci=pcie_bus_safe.\n");
1387+
}
1388+
1389+
dev->pcie_mpss = ffs(mps) - 8;
1390+
}
1391+
1392+
rc = pcie_set_mps(dev, mps);
1393+
if (rc)
1394+
dev_err(&dev->dev, "Failed attempting to set the MPS\n");
1395+
}
1396+
1397+
static void pcie_write_mrrs(struct pci_dev *dev, int mps)
1398+
{
1399+
int rc, mrrs;
1400+
1401+
if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
1402+
int dev_mpss = 128 << dev->pcie_mpss;
1403+
1404+
/* For Max performance, the MRRS must be set to the largest
1405+
* supported value. However, it cannot be configured larger
1406+
* than the MPS the device or the bus can support. This assumes
1407+
* that the largest MRRS available on the device cannot be
1408+
* smaller than the device MPSS.
1409+
*/
1410+
mrrs = mps < dev_mpss ? mps : dev_mpss;
1411+
} else
1412+
/* In the "safe" case, configure the MRRS for fairness on the
1413+
* bus by making all devices have the same size
1414+
*/
1415+
mrrs = mps;
1416+
1417+
1418+
/* MRRS is a R/W register. Invalid values can be written, but a
1419+
* subsequent read will verify if the value is acceptable or not.
1420+
* If the MRRS value provided is not acceptable (e.g., too large),
1421+
* shrink the value until it is acceptable to the HW.
1422+
*/
1423+
while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
1424+
rc = pcie_set_readrq(dev, mrrs);
1425+
if (rc)
1426+
dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
1427+
1428+
mrrs /= 2;
1429+
}
1430+
}
1431+
1432+
static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
1433+
{
1434+
int mps = 128 << *(u8 *)data;
1435+
1436+
if (!pci_is_pcie(dev))
1437+
return 0;
1438+
1439+
dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
1440+
pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
1441+
1442+
pcie_write_mps(dev, mps);
1443+
pcie_write_mrrs(dev, mps);
1444+
1445+
dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
1446+
pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
1447+
1448+
return 0;
1449+
}
1450+
1451+
/* pcie_bus_configure_settings requires that pci_walk_bus work in a top-down,
1452+
* parents then children fashion. If this changes, then this code will not
1453+
* work as designed.
1454+
*/
1455+
void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
1456+
{
1457+
u8 smpss = mpss;
1458+
1459+
if (!bus->self)
1460+
return;
1461+
1462+
if (!pci_is_pcie(bus->self))
1463+
return;
1464+
1465+
if (pcie_bus_config == PCIE_BUS_SAFE) {
1466+
pcie_find_smpss(bus->self, &smpss);
1467+
pci_walk_bus(bus, pcie_find_smpss, &smpss);
1468+
}
1469+
1470+
pcie_bus_configure_set(bus->self, &smpss);
1471+
pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
1472+
}
1473+
13291474
unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
13301475
{
13311476
unsigned int devfn, pass, max = bus->secondary;

include/linux/pci.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@ struct pci_dev {
251251
u8 revision; /* PCI revision, low byte of class word */
252252
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
253253
u8 pcie_cap; /* PCI-E capability offset */
254-
u8 pcie_type; /* PCI-E device/port type */
254+
u8 pcie_type:4; /* PCI-E device/port type */
255+
u8 pcie_mpss:3; /* PCI-E Max Payload Size Supported */
255256
u8 rom_base_reg; /* which config register controls the ROM */
256257
u8 pin; /* which interrupt pin this device uses */
257258

@@ -617,6 +618,16 @@ struct pci_driver {
617618
/* these external functions are only available when PCI support is enabled */
618619
#ifdef CONFIG_PCI
619620

621+
extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
622+
623+
enum pcie_bus_config_types {
624+
PCIE_BUS_PERFORMANCE,
625+
PCIE_BUS_SAFE,
626+
PCIE_BUS_PEER2PEER,
627+
};
628+
629+
extern enum pcie_bus_config_types pcie_bus_config;
630+
620631
extern struct bus_type pci_bus_type;
621632

622633
/* Do NOT directly access these two variables, unless you are arch specific pci
@@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
796807
int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
797808
int pcie_get_readrq(struct pci_dev *dev);
798809
int pcie_set_readrq(struct pci_dev *dev, int rq);
810+
int pcie_get_mps(struct pci_dev *dev);
811+
int pcie_set_mps(struct pci_dev *dev, int mps);
799812
int __pci_reset_function(struct pci_dev *dev);
800813
int pci_reset_function(struct pci_dev *dev);
801814
void pci_update_resource(struct pci_dev *dev, int resno);

0 commit comments

Comments
 (0)