How exactly does LSTMCell from TensorFlow operate?



I examined this link, and your code is almost perfect, but you forgot to add the forget_bias value (default 1.0) in this line: F = vsigmoid(g3). It should actually be F = vsigmoid(g3 + self.forget_bias), or in your case, since forget_bias is 1, F = vsigmoid(g3 + 1).
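For illustration, here is a minimal sketch of that fix, assuming vsigmoid is a vectorized sigmoid and g3 holds the forget gate's pre-activations, as in your code (the values below are hypothetical):

import numpy as np

def vsigmoid(x):
    # vectorized sigmoid (assumed to match your implementation)
    return 1 / (1 + np.exp(-x))

forget_bias = 1.0                      # TensorFlow's default
g3 = np.array([-0.5, 0.0, 0.5])        # hypothetical forget-gate pre-activations
F_wrong = vsigmoid(g3)                 # missing the bias
F_right = vsigmoid(g3 + forget_bias)   # what TensorFlow actually computes
print(F_wrong, F_right)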

Here is my implementation with NumPy:

import numpy as np
import tensorflow as tf

num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units=num_units)
batch = 1
timesteps = 7
num_input = 4
X = tf.placeholder("float", [batch, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.reshape(range(28), [batch, timesteps, num_input])
res = sess.run(outputs, feed_dict={X: x_val})
for e in res:
    print(e)

print("\nmy imp\n")

# my implementation
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# pull the kernel, bias, and forget bias out of the TensorFlow cell
kernel, bias = sess.run([lstm._kernel, lstm._bias])
f_b_ = lstm._forget_bias
c, h = np.zeros([batch, num_units]), np.zeros([batch, num_units])
for step in range(timesteps):
    inpt = np.split(x_val, timesteps, 1)[step][0]
    # one matmul produces all four gate pre-activations, ordered i, j, f, o
    lstm_mtrx = np.matmul(np.concatenate([inpt, h], 1), kernel) + bias
    i, j, f, o = np.split(lstm_mtrx, 4, 1)
    c = sigmoid(f + f_b_) * c + sigmoid(i) * np.tanh(j)
    h = sigmoid(o) * np.tanh(c)
    print(h)
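In equation form, each iteration of the loop computes the standard LSTM update (with \sigma the sigmoid, \odot elementwise multiplication, and b_f the forget bias):

\begin{aligned}
[i, j, f, o] &= [x_t, h_{t-1}]\,W + b \\
c_t &= \sigma(f + b_f) \odot c_{t-1} + \sigma(i) \odot \tanh(j) \\
h_t &= \sigma(o) \odot \tanh(c_t)
\end{aligned}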

output:

[[ 0.06964055 -0.06541953 -0.00682676]]
[[ 0.005264   -0.03234607  0.00014838]]
[[ 1.617855e-04 -1.316892e-02  8.596722e-06]]
[[ 3.9425286e-06 -5.1347450e-03  7.5078127e-08]]
[[ 8.7508155e-08 -1.9560163e-03  6.3853928e-10]]
[[ 1.8867894e-09 -7.3784427e-04  5.8551406e-12]]
[[ 4.0385355e-11 -2.7728223e-04  5.3957669e-14]]

my imp

[[ 0.06964057 -0.06541953 -0.00682676]]
[[ 0.005264   -0.03234607  0.00014838]]
[[ 1.61785520e-04 -1.31689185e-02  8.59672610e-06]]
[[ 3.94252745e-06 -5.13474567e-03  7.50781122e-08]]
[[ 8.75080644e-08 -1.95601574e-03  6.38539112e-10]]
[[ 1.88678843e-09 -7.37844070e-04  5.85513438e-12]]
[[ 4.03853841e-11 -2.77282006e-04  5.39576024e-14]]


TensorFlow uses the glorot_uniform() function to initialize the LSTM kernel, which samples weights from a random uniform distribution. We need to fix the kernel to a known value to get reproducible results:

import tensorflow as tf
import numpy as np

np.random.seed(0)
timesteps = 7
num_input = 4
x_val = np.random.normal(size=(1, timesteps, num_input))
num_units = 3

def glorot_uniform(shape):
    limit = np.sqrt(6.0 / (shape[0] + shape[1]))
    return np.random.uniform(low=-limit, high=limit, size=shape)

kernel_init = glorot_uniform((num_input + num_units, 4 * num_units))

My implementation of the LSTMCell (well, actually it's just TensorFlow's own code, slightly rewritten):

def sigmoid(x):
    return 1. / (1 + np.exp(-x))

class LSTMCell():
    """Long short-term memory unit (LSTM) recurrent network cell."""

    def __init__(self, num_units, initializer=glorot_uniform,
                 forget_bias=1.0, activation=np.tanh):
        """Initialize the parameters for an LSTM cell.

        Args:
          num_units: int, The number of units in the LSTM cell.
          initializer: The initializer to use for the kernel matrix.
            Default: glorot_uniform.
          forget_bias: Biases of the forget gate are initialized by default
            to 1 in order to reduce the scale of forgetting at the beginning
            of the training.
          activation: Activation function of the inner states.
            Default: np.tanh.
        """
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation
        self._initializer = initializer

    def build(self, inputs_shape):
        input_depth = inputs_shape[-1]
        h_depth = self._num_units
        self._kernel = self._initializer(shape=(input_depth + h_depth, 4 * self._num_units))
        self._bias = np.zeros(shape=(4 * self._num_units))

    def call(self, inputs, state):
        """Run one step of LSTM.

        Args:
          inputs: input numpy array, `[input_size]`.
          state: a tuple of numpy arrays holding the previous cell state
            `c_state` and previous output `m_state`.

        Returns:
          A tuple containing:
          - A `[num_units]` numpy array representing the output of the LSTM
            after reading `inputs` when the previous state was `state`.
          - The new state of the LSTM after reading `inputs`; same type and
            shape(s) as `state`.
        """
        (c_prev, m_prev) = state
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        lstm_matrix = np.hstack([inputs, m_prev]).dot(self._kernel)
        lstm_matrix += self._bias
        i, j, f, o = np.split(lstm_matrix, indices_or_sections=4, axis=0)
        # Diagonal connections
        c = (sigmoid(f + self._forget_bias) * c_prev +
             sigmoid(i) * self._activation(j))
        m = sigmoid(o) * self._activation(c)
        new_state = (c, m)
        return m, new_state

X = x_val.reshape(x_val.shape[1:])
cell = LSTMCell(num_units, initializer=lambda shape: kernel_init)
cell.build(X.shape)
state = (np.zeros(num_units), np.zeros(num_units))
for i in range(timesteps):
    x = X[i, :]
    output, state = cell.call(x, state)
    print(output)

Produces output:

[-0.21386017 -0.08401277 -0.25431477]
[-0.22243588 -0.25817422 -0.1612211 ]
[-0.2282134  -0.14207162 -0.35017249]
[-0.23286737 -0.17129192 -0.2706512 ]
[-0.11768674 -0.20717363 -0.13339118]
[-0.0599215  -0.17756104 -0.2028935 ]
[ 0.11437953 -0.19484555  0.05371994]

Meanwhile, your TensorFlow code, if you replace the second line with

lstm = tf.nn.rnn_cell.LSTMCell(num_units=num_units, initializer=tf.constant_initializer(kernel_init))

returns:

[[-0.2138602  -0.08401276 -0.25431478]]
[[-0.22243595 -0.25817424 -0.16122109]]
[[-0.22821338 -0.1420716  -0.35017252]]
[[-0.23286738 -0.1712919  -0.27065122]]
[[-0.1176867  -0.2071736  -0.13339119]]
[[-0.05992149 -0.177561   -0.2028935 ]]
[[ 0.11437953 -0.19484554  0.05371996]]
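If you want to check the match programmatically rather than by eye, one option is a sketch like the following, assuming you collect the per-step outputs of both runs into lists named tf_outputs and np_outputs (hypothetical names, not defined in the code above):

import numpy as np

# tf_outputs: list of [batch, num_units] arrays from the TensorFlow run (hypothetical)
# np_outputs: list of [num_units] arrays produced by the NumPy loop (hypothetical)
tf_arr = np.squeeze(np.array(tf_outputs))
np_arr = np.array(np_outputs)
print(np.allclose(tf_arr, np_arr, atol=1e-6))  # True up to float32 rounding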


From a linear algebra standpoint, there may be a dimension mismatch in the matrix multiplication between I*N (red circle), affecting the output, given that multiplying an n x m matrix by an m x p matrix yields an n x p output.

[image: LSTM cell diagram]
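As a quick illustration of that shape rule (a generic sketch, not tied to the specific matrices in the figure):

import numpy as np

A = np.ones((2, 3))       # n x m
B = np.ones((3, 4))       # m x p
print((A @ B).shape)      # (2, 4): an n x p result; inner dimensions matched

C = np.ones((4, 4))
try:
    A @ C                 # inner dimensions 3 and 4 do not match
except ValueError as err:
    print(err)            # matmul raises on the dimension mismatch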