I'm working on a four-class text classification problem. I took a pretrained masked language model and added a classification head on top. The model takes two text sources as input and should output normalized class probabilities.
Here is my model:

    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    def build_model():
        # Text input 1: token ids for the first text source
        inputs_descr = keras.Input(shape=(seq_length_descr,), dtype=tf.int32, name='input_1')
        out_descr = pretrained_lm.layers[1](inputs_descr)  # embedding layer of the pretrained LM
        out_descr = layers.Bidirectional(
            layers.LSTM(50, return_sequences=True, activation='tanh',
                        dropout=0.2, recurrent_dropout=0))(out_descr)
        out_descr = layers.GlobalMaxPool1D()(out_descr)

        # Text input 2: token ids for the second text source
        inputs_tw = keras.Input(shape=(seq_length_tw,), dtype=tf.int32, name='input_2')
        out_tw = pretrained_lm.layers[1](inputs_tw)
        out_tw = layers.Bidirectional(
            layers.LSTM(50, return_sequences=True, activation='tanh',
                        dropout=0.2, recurrent_dropout=0))(out_tw)
        out_tw = layers.GlobalMaxPool1D()(out_tw)

        # Concatenate the pooled representations of both branches
        last_hidden_state_conc = layers.concatenate([out_descr, out_tw])

        out = layers.Dense(60, activation='relu')(last_hidden_state_conc)
        out = layers.Dropout(0.2)(out)
        out = layers.Dense(30, activation='relu')(out)
        out = layers.Dropout(0.2)(out)

        # Classification head: 4 classes with softmax
        output = layers.Dense(4, activation='softmax')(out)

        lm_mediano_multi = keras.Model([inputs_descr, inputs_tw], output)
        return lm_mediano_multi
The problem is that the output probabilities for each sample do not add up to exactly 1, which they should, given the softmax activation in the last layer:

    array([[1.0000e+00, 5.9605e-08, 2.0027e-05, 7.7486e-07],
           [1.0000e+00, 7.1526e-07, 7.0095e-05, 1.6034e-05],
           [1.0000e+00, 0.0000e+00, 8.3447e-07, 1.1921e-07],
           [1.0000e+00, 0.0000e+00, 2.5034e-06, 5.9605e-08],
           [7.3975e-01, 2.6836e-03, 2.1460e-01, 4.3060e-02],
           [1.0000e+00, 4.7684e-07, 6.8307e-05, 6.3181e-06],
           [1.0000e+00, 0.0000e+00, 3.8147e-06, 7.1526e-07]])
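
For reference, this is the kind of check that shows the deviation (a minimal sketch; `preds` stands for the prediction array above):

    import numpy as np

    # Each row of a softmax output should sum to exactly 1.0;
    # for the predictions above, the sums land slightly above 1.
    row_sums = preds.sum(axis=1)
    print(row_sums)
    print(np.abs(row_sums - 1.0).max())  # size of the worst deviation
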
Just in case it is relevant: I also set `keras.mixed_precision.set_global_policy("mixed_float16")`.
I haven't found an explanation for this.
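In case the mixed-precision policy turns out to be the cause: the Keras mixed precision guide recommends keeping the model's output layer in float32 so the softmax is computed in full precision. A minimal sketch of that change to `build_model` (only the last layer differs):

    # Compute the final softmax in float32 even under the
    # mixed_float16 policy, as the Keras mixed precision
    # guide recommends for the model's outputs.
    output = layers.Dense(4, activation='softmax', dtype='float32')(out)
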
Thanks for the input!
