TypeError: exceptions must derive from BaseException
iGuaZi opened this issue · comments
Thanks for reaching out.
Could you try a torch version other than 1.12 or 2.0, e.g. torch 1.13 or 1.11? Torch 1.12 has a bug when extracting the trace graph that broke a lot of open-source libraries. Torch 2.0 is so new that I have not tested it yet.
@iGuaZi I recently fixed the Type._C.Node issues, which happened for a few specific torch versions. Please try again. Thanks.
Glad that you succeeded with the tutorial. Could you please share your personal network with me? If it is confidential, you could share it via email at Tianyi.Chen@microsoft.com. I could then take a quick look.
We support `sgd`, `Adam`, and `adamw` at present. Please see https://github.com/tianyic/only_train_once/issues/12 for more details.
Here is a minimal case. Please take a look, thanks.
import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
def Normalize(X, N):
    """Standardize ``X`` using the statistics stored in ``N``.

    Args:
        X: Values to normalize.
        N: Indexable pair of statistics; ``N[0]`` is used as the mean and
            ``N[1]`` as the standard deviation.

    Returns:
        ``(X - N[0]) / N[1]``.
    """
    mean = N[0]
    std = N[1]
    return (X - mean) / std
def Renormalize(X, N):
    """Map standardized values back using the statistics stored in ``N``.

    Args:
        X: Values to rescale.
        N: Indexable pair of statistics; ``N[0]`` is used as the mean and
            ``N[1]`` as the standard deviation.

    Returns:
        ``(X * N[1]) + N[0]``.
    """
    mean = N[0]
    std = N[1]
    return (X * std) + mean
class Model(torch.nn.Module):
    """Gated mixture-of-experts network.

    A gating network (three ``nn.Linear`` layers with ELU activations and a
    final softmax) produces blending weights, which are fed to three
    ``ExpertLinear`` layers that form the main network.
    """

    def __init__(self,
                 gating_indices,
                 gating_input,
                 gating_hidden,
                 gating_output,
                 main_indices,
                 main_input,
                 main_hidden,
                 main_output,
                 dropout,
                 input_norm,
                 output_norm
                 ):
        """Build the gating and main networks.

        Args:
            gating_indices: Column indices of the input fed to the gating
                network.
            gating_input: Input width of the first gating layer.
            gating_hidden: Width of the gating hidden layers.
            gating_output: Output width of the gating network; also passed
                to every ``ExpertLinear`` as its number of experts.
            main_indices: Column indices of the input fed to the main
                network.
            main_input: Input width of the first expert layer.
            main_hidden: Width of the main hidden layers.
            main_output: Output width of the last expert layer.
            dropout: Dropout probability passed to ``F.dropout``.
            input_norm: NumPy array; row 0 is used as the input mean and
                row 1 as the input standard deviation.
            output_norm: NumPy array; row 0 is used as the output mean and
                row 1 as the output standard deviation.
        """
        super().__init__()

        # Only warn on a feature-count mismatch; construction continues.
        if len(gating_indices) + len(main_indices) != len(input_norm[0]):
            print(
                f"Warning: Number of gating features ({len(gating_indices)}) "
                f"and main features ({len(main_indices)}) are not the same as "
                f"input features ({len(input_norm[0])})."
            )

        self.gating_indices = gating_indices
        self.main_indices = main_indices

        self.G1 = nn.Linear(gating_input, gating_hidden)
        self.G2 = nn.Linear(gating_hidden, gating_hidden)
        self.G3 = nn.Linear(gating_hidden, gating_output)

        self.E1 = ExpertLinear(gating_output, main_input, main_hidden)
        self.E2 = ExpertLinear(gating_output, main_hidden, main_hidden)
        self.E3 = ExpertLinear(gating_output, main_hidden, main_output)

        self.dropout = dropout

        # Normalization statistics are stored as non-trainable parameters.
        self.Xnorm = Parameter(torch.from_numpy(input_norm), requires_grad=False)
        self.Ynorm = Parameter(torch.from_numpy(output_norm), requires_grad=False)

    def forward(self, x):
        """Run the gating network, then the expert-blended main network.

        Args:
            x: Input tensor; columns are selected along dimension 1
                (presumably shaped ``(batch, features)`` -- confirm with
                callers).

        Returns:
            A tuple ``(y, w)``: ``y`` is the main-network output rescaled
            with ``Ynorm``, and ``w`` is the softmax output of the gating
            network.
        """
        x = Normalize(x, self.Xnorm)

        # Gating
        g = x[:, self.gating_indices]

        g = F.dropout(g, self.dropout, training=self.training)
        g = self.G1(g)
        g = F.elu(g)

        g = F.dropout(g, self.dropout, training=self.training)
        g = self.G2(g)
        g = F.elu(g)

        g = F.dropout(g, self.dropout, training=self.training)
        g = self.G3(g)

        w = F.softmax(g, dim=1)

        # Main
        m = x[:, self.main_indices]

        m = F.dropout(m, self.dropout, training=self.training)
        m = self.E1(m, w)
        m = F.elu(m)

        m = F.dropout(m, self.dropout, training=self.training)
        m = self.E2(m, w)
        m = F.elu(m)

        m = F.dropout(m, self.dropout, training=self.training)
        m = self.E3(m, w)

        return Renormalize(m, self.Ynorm), w
# Output-Blended MoE Layer
class ExpertLinear(torch.nn.Module):
    """Linear layer whose output is a weighted blend of per-expert outputs.

    Holds one weight matrix and one bias row per expert; ``forward`` sums
    the experts' affine outputs, each scaled by the matching column of the
    supplied blending weights.
    """

    def __init__(self, experts, input_dim, output_dim):
        """Create the per-expert parameters.

        Args:
            experts: Number of experts.
            input_dim: Input width of each expert.
            output_dim: Output width of each expert.
        """
        super().__init__()

        self.experts = experts
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.W = self.weights([experts, input_dim, output_dim])
        self.b = self.bias([experts, 1, output_dim])

    def forward(self, x, weights):
        """Blend the experts' outputs.

        Args:
            x: Input tensor; ``x.shape[0]`` rows, multiplied by each
                ``(input_dim, output_dim)`` expert matrix.
            weights: Blending weights; column ``i`` scales expert ``i``.

        Returns:
            Tensor of shape ``(x.shape[0], output_dim)``.
        """
        # The accumulator is a plain zero tensor matching x's dtype and
        # device. It must not require grad itself: gradients reach W and b
        # through the additions below, and a grad-requiring accumulator
        # would force the output to require grad even with frozen params.
        y = x.new_zeros((x.shape[0], self.output_dim))
        for i in range(self.experts):
            expert_out = x.matmul(self.W[i, :, :]) + self.b[i, :, :]
            y = y + weights[:, i].unsqueeze(1) * expert_out
        return y

    def weights(self, shape):
        """Return a trainable float32 parameter drawn uniformly at random.

        Values lie in ``[-a, a)`` with ``a = sqrt(6 / prod(shape[-2:]))``
        and are drawn from NumPy's global random state.
        """
        alpha_bound = np.sqrt(6.0 / np.prod(shape[-2:]))
        alpha = np.asarray(
            np.random.uniform(low=-alpha_bound, high=alpha_bound, size=shape),
            dtype=np.float32,
        )
        return Parameter(torch.from_numpy(alpha), requires_grad=True)

    def bias(self, shape):
        """Return a trainable zero-initialized float parameter of ``shape``."""
        return Parameter(torch.zeros(shape, dtype=torch.float), requires_grad=True)
def unittest0():
    """Build a ``Model`` with fixed sizes and run one forward pass.

    Prints the shapes of the two tensors returned by the model.
    """
    num_main = 441
    num_gating = 130
    num_inputs = num_main + num_gating  # 571 input features in total

    # Gating features occupy the columns right after the main features.
    gating_indices = torch.arange(num_main, num_inputs)
    main_indices = torch.arange(num_main)

    dropout = 0.3
    gating_hidden = 64
    main_hidden = 512
    experts = 6
    output_dim = 530

    # Row 0 is used as the mean and row 1 as the standard deviation.
    Xnorm = np.random.randn(2, num_inputs).astype(np.float32)
    Ynorm = np.random.randn(2, output_dim).astype(np.float32)

    model = Model(
        gating_indices=gating_indices,
        gating_input=len(gating_indices),
        gating_hidden=gating_hidden,
        gating_output=experts,
        main_indices=main_indices,
        main_input=len(main_indices),
        main_hidden=main_hidden,
        main_output=output_dim,
        dropout=dropout,
        input_norm=Xnorm,
        output_norm=Ynorm,
    )

    dummy_inpt = torch.randn(1, num_inputs)
    out1, out2 = model(dummy_inpt)
    print(out1.shape, out2.shape)


if __name__ == "__main__":
    unittest0()
Thanks for the example, @iGuaZi. I will take a look when I have bandwidth. At first glance, I found that the `matmul` operator is used. Please refer to our operator list: `matmul` is currently not fully supported, though it is under consideration in our ongoing PR. Therefore, it might cause some trouble when applying OTO to this network.