$$ S_k = f(S_{k-1} \cdot W_{rec} + X_k \cdot W_x) $$
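Because the update implemented below is linear (the code applies no activation, i.e. $f$ is the identity), unrolling the recurrence from the initial state $S_0 = 0$ expresses the final state as a weighted sum of the inputs; with both weights equal to 1 it is exactly the sum of the sequence, which is the target the toy data below is built from:

$$ S_n = \sum_{k=1}^{n} W_x \, W_{rec}^{\,n-k} \, X_k, \qquad S_n = \sum_{k=1}^{n} X_k \quad \text{when } W_x = W_{rec} = 1 $$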

import numpy as np

n_samples = 50
len_sequence = 10
X = np.zeros((n_samples, len_sequence))  # X.shape = (50, 10)
for row_idx in range(n_samples):  # fill X with random binary values (0s and 1s)
    X[row_idx, :] = np.around(np.random.rand(len_sequence)).astype(int)
target = np.sum(X, axis=1)  # target.shape = (50,): the number of ones in each sequence
def update_state(xk, sk, wx, wRec):
    # Compute state S_k from input x_k and the previous state S_{k-1}
    return xk * wx + sk * wRec

def forward_states(X, wx, wRec):
    # Unroll the recurrence over the whole sequence; S[:, 0] is the initial state of zeros
    S = np.zeros((X.shape[0], X.shape[1] + 1))
    for k in range(X.shape[1]):
        S[:, k+1] = update_state(X[:, k], S[:, k], wx, wRec)
    return S
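As an optional sanity check (not part of the gradient test below, and using only the names defined above), setting both weights to 1 should make the final column of forward_states reproduce the target exactly:

# Illustrative sanity check: with wx = wRec = 1 the final state is the plain sum of the inputs
S_check = forward_states(X, 1.0, 1.0)
assert np.allclose(S_check[:, -1], target)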
def loss(y, t):  # MSE as the loss function
    return np.mean((t - y)**2)

def output_gradient(y, t):  # gradient of the loss wrt the prediction y
    # (the 1/n_samples factor of the mean is applied later inside backward_gradient)
    return -2 * (t - y)

def backward_gradient(X, S, grad_out, wRec):
    # Propagate the gradient backwards through time and accumulate the
    # gradients of the loss wrt wx and wRec (averaged over the samples)
    grad_over_time = np.zeros((X.shape[0], X.shape[1] + 1))
    grad_over_time[:, -1] = grad_out
    wx_grad = 0
    wRec_grad = 0
    for k in range(X.shape[1], 0, -1):  # walk over the time steps, last to first
        wx_grad += np.mean(grad_over_time[:, k] * X[:, k-1])
        wRec_grad += np.mean(grad_over_time[:, k] * S[:, k-1])
        grad_over_time[:, k-1] = grad_over_time[:, k] * wRec
    return (wx_grad, wRec_grad), grad_over_time
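The loop above is the chain rule for this linear recurrence: the gradient flowing into state $S_k$ is passed back to $S_{k-1}$ by a multiplication with $W_{rec}$, while the weight gradients accumulate a contribution at every time step (the np.mean averages over the 50 samples, supplying the $1/n$ factor of the MSE):

$$ \frac{\partial \mathcal{L}}{\partial S_{k-1}} = \frac{\partial \mathcal{L}}{\partial S_k} \, W_{rec}, \qquad \frac{\partial \mathcal{L}}{\partial W_x} = \sum_{k=1}^{n} \frac{\partial \mathcal{L}}{\partial S_k} \, X_k, \qquad \frac{\partial \mathcal{L}}{\partial W_{rec}} = \sum_{k=1}^{n} \frac{\partial \mathcal{L}}{\partial S_k} \, S_{k-1} $$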
params = [1.2, 1.2]  # [wx, wRec]
eps = 1e-7
S = forward_states(X, params[0], params[1])
grad_out = output_gradient(S[:, -1], target)  # gradient of the loss wrt the final state S_n
backprop_grads, grad_over_time = backward_gradient(X, S, grad_out, params[1])
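Each backpropagation gradient is then compared against a numerical estimate obtained by central differences, nudging one parameter at a time by $\pm \epsilon$ and re-running the forward pass:

$$ \frac{\partial \mathcal{L}}{\partial \theta} \approx \frac{\mathcal{L}(\theta + \epsilon) - \mathcal{L}(\theta - \epsilon)}{2 \epsilon} $$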
for p_idx, _ in enumerate(params):
    grad_backprop = backprop_grads[p_idx]
    # numerical gradient via central differences: perturb the parameter by +/- eps
    params[p_idx] += eps
    plus_loss = loss(forward_states(X, params[0], params[1])[:, -1], target)
    params[p_idx] -= 2 * eps
    min_loss = loss(forward_states(X, params[0], params[1])[:, -1], target)
    params[p_idx] += eps  # restore the original parameter value
    grad_num = (plus_loss - min_loss) / (2 * eps)
    if not np.isclose(grad_num, grad_backprop):
        raise ValueError((
            f'Numerical gradient of {grad_num:.6f} is not close to '
            f'the backpropagation gradient of {grad_backprop:.6f}!'))
print('No gradient errors found')