Description
I'm getting a Bus error (core dumped) when using the share_memory method on a model.
OS: Ubuntu 16.04
It happens with both Python 2.7 and 3.5, in a conda environment and with a regular install. I'm using the latest version from http://pytorch.org/. I've also tried installing from source; same issue.
I tried doing a basic test using this code:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(2563*50, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)

n = Net()
n.share_memory()
print('okay')
With a small first-layer size it works fine, but anything above some threshold throws the bus error. If I don't call share_memory() at all, it also works fine.
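For context, here is a rough back-of-the-envelope check I did (my own sketch; I haven't pinned down the exact threshold) of how much memory the first conv layer's weights alone take, since share_memory() has to move every parameter's storage into shared memory:

import torch.nn as nn

conv1 = nn.Conv2d(2563*50, 10, kernel_size=5)
n_elems = conv1.weight.data.numel()            # 10 * 128150 * 5 * 5 = 32,037,500 floats
print(n_elems, n_elems * 4 / 1024.0 / 1024.0)  # ~122 MB of float32 data for this layer alone

So the failing call is trying to put well over 100 MB into a shared segment just for that one layer.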
I ran trace, here are the last few lines of the output.
module.py(391): if module is not None and module not in memo:
module.py(392): memo.add(module)
module.py(393): yield name, module
module.py(378): yield module
module.py(118): module._apply(fn)
--- modulename: module, funcname: _apply
module.py(117): for module in self.children():
--- modulename: module, funcname: children
module.py(377): for name, module in self.named_children():
--- modulename: module, funcname: named_children
module.py(389): memo = set()
module.py(390): for name, module in self._modules.items():
module.py(120): for param in self._parameters.values():
module.py(121): if param is not None:
module.py(124): param.data = fn(param.data)
--- modulename: module, funcname: <lambda>
module.py(468): return self._apply(lambda t: t.share_memory_())
--- modulename: tensor, funcname: share_memory_
tensor.py(86): self.storage().share_memory_()
--- modulename: storage, funcname: share_memory_
storage.py(95): from torch.multiprocessing import get_sharing_strategy
--- modulename: _bootstrap, funcname: _handle_fromlist
<frozen importlib._bootstrap>(1006): <frozen importlib._bootstrap>(1007): <frozen importlib._bootstrap>(1012): <frozen importlib._bootstrap>(1013): <frozen importlib._bootstrap>(1012): <frozen importlib._bootstrap>(1025):
storage.py(96): if self.is_cuda:
storage.py(98): elif get_sharing_strategy() == 'file_system':
--- modulename: __init__, funcname: get_sharing_strategy
__init__.py(59): return _sharing_strategy
storage.py(101): self._share_fd_()
Bus error (core dumped)
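To isolate it further, here is a minimal sketch I put together (my own reduced repro attempt, not taken from the trace) that exercises the same storage-level path the trace ends in, storage().share_memory_() and then _share_fd_(), on a single large tensor of roughly the same size as the conv1 weight:

import torch

# Roughly the same number of elements as conv1's weight (~32M floats, ~122 MB).
t = torch.zeros(32037500)

# Module.share_memory() bottoms out in this call for each parameter's storage;
# per the trace, the default 'file_descriptor' strategy then goes through _share_fd_().
t.storage().share_memory_()
print('shared okay')

If this alone reproduces the bus error, then it's purely about the size of the shared allocation and not about the module machinery.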
I tried running gdb, but it won't give me a full backtrace.
I've also tried creating a symbolic link to libgomp.so.1, since I suspected a similar issue, but I still get the same error.
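Another thing I can check (this is just my guess that the fd-backed shared segment lives on a tmpfs like /dev/shm; I haven't confirmed where it's actually allocated) is how much space is available there:

import os

# Free space on /dev/shm (assuming that's where the shared segment is backed).
st = os.statvfs('/dev/shm')
print('free MB:', st.f_bavail * st.f_frsize / 1024.0 / 1024.0)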
Any suggestions? This is running inside a Docker container, by the way.