Loss is NaN with TensorFlow

  • Thread starter: Sacha Levatic

Sacha Levatic

Guest
I've developed a TensorFlow model for an artificial intelligence project, but the loss becomes NaN during training. Here's an extract from my code:

Code:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import data as dt
print("[Info] Data loaded")

import numpy as np
print("[Info] Numpy loaded")
import random
print("[Info] Random loaded")
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
print("[Info] Tensorflow loaded")
import atexit
print("[Info] Atexit loaded")
import threading
print("[Info] Threading loaded")


# Enable GPU memory growth so TensorFlow doesn't grab all GPU memory up front
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# Thread settings (0 = let TensorFlow pick the number of CPU threads automatically)
tf.config.threading.set_intra_op_parallelism_threads(0)
tf.config.threading.set_inter_op_parallelism_threads(0)

inputs = dt.train_inputs
outputs = dt.train_outputs
val_inputs = dt.val_inputs
val_outputs = dt.val_outputs
test_inputs = dt.test_inputs
test_outputs = dt.test_outputs

# Check data for NaNs or infinite values
def check_data(data, name):
    if np.isnan(data).any() or np.isinf(data).any():
        print(f"[Error] {name} contient des NaNs ou des valeurs infinies")
    else:
        print(f"[Info] {name} est valide")

check_data(inputs, "train_inputs")
check_data(outputs, "train_outputs")
check_data(val_inputs, "val_inputs")
check_data(val_outputs, "val_outputs")
check_data(test_inputs, "test_inputs")
check_data(test_outputs, "test_outputs")

class EThAI:
    def __init__(self, save: bool=True, load: bool=dt.values["DoLoad"]):
        # Create a lock to synchronise access to the file
        self.file_access_lock = threading.Lock()
        self.save = bool(save)
        self.load = bool(load)

        self.build_model()

        if self.save:
            self.model.save("CryptoAIModel", save_format='tf')

    def build_model(self, learning_rate=0.0001):
        params = {
            'dense_units1': 256,
            'dense_units2': 64
        }
        # Model with optimized hyperparameters
        self.model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(33 * 80,)),
            tf.keras.layers.Dense(params["dense_units1"], activation='sigmoid'),
            tf.keras.layers.Dense(params["dense_units2"], activation='sigmoid'),
            tf.keras.layers.Dense(3, activation='softmax')
        ])
        
        if self.load:
            self.model = tf.keras.models.load_model("CryptoAIModel")

        # Compile the model with the Adam optimizer and the given learning rate
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss='categorical_crossentropy',
            metrics=['accuracy'])


        print(f"[Info] Model Build (learning_rate={learning_rate})")

    def train(self, epochs: int):
        print("[Info] Starting...")

        def callback(epoch, _):
            self.model.save("CryptoAIModel", save_format='tf')

        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        lambda_callback = LambdaCallback(
            on_epoch_end=callback
        )

        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

        if dt.values["DoBatch"]:
            # Use this list of callbacks when training the model
            self.model.fit(inputs, outputs, epochs=epochs, batch_size=dt.values["BatchSize"], verbose=1, 
                validation_data=(val_inputs, val_outputs),
                callbacks=[early_stopping, lambda_callback, tensorboard_callback])
        else:
            self.model.fit(inputs, outputs, epochs=epochs, batch_size=dt.values["BatchSize"], validation_data=(val_inputs, val_outputs), verbose=1,
                callbacks=[early_stopping, lambda_callback, tensorboard_callback])

        self.model.save("CryptoAIModel", save_format='tf')
        print("[Info] Finished !!!")


    def predict(self, input_data):
        prediction = self.model.predict(input_data)
        return prediction
    
    def final_output(self, output):
        values = output[0]
        rounded_values = []
        for value in values:
            if value < 0.5:
                rounded_values.append(0)
            else:
                rounded_values.append(1)
        return rounded_values
    
    def get_random_input_output(self, batch_size=dt.values["BatchSize"]):
        num = random.randint(0, len(inputs) - batch_size)
        input_data = np.array([inputs[num+i] for i in range(batch_size)])
        output_data = np.array([outputs[num+i] for i in range(batch_size)])
        return input_data, output_data


if __name__ == '__main__':
    AI = EThAI(save=True)
    try:
        AI.build_model(0.001)
        dt.values["DoLoad"] = True
        AI.load = True
        print("[Info] Training Soon!!!")
        AI.train(20)
        AI.build_model(0.0001)
        AI.train(20)
        AI.build_model(0.00001)
        AI.train(20)
        AI.build_model(0.000001)
        AI.train(20)
        AI.build_model(0.0000001)
        AI.train(20)
        dt.values["DoLoad"] = True
    except Exception as e:
        print(e)
        if AI.save:
            AI.model.save("CryptoAIModel", save_format='tf')

I have already checked my data for NaN and infinite values before training, but the problem persists. How can I fix the NaN loss in my TensorFlow model? I am using TensorFlow 2.13.0 on Python 3.8.10 with a GPU.
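
For reference, a minimal sketch of how the first NaN could be localised (assuming the same AI, inputs and outputs as in the code above; batch_size=32 is an arbitrary choice for a diagnostic run): tf.keras.callbacks.TerminateOnNaN aborts training as soon as the loss becomes NaN, and tf.debugging.enable_check_numerics() raises an error on the first op that produces a NaN or Inf, instead of letting it propagate into the loss.

Code:
import tensorflow as tf

# Diagnostic run only: check_numerics adds considerable overhead.
tf.debugging.enable_check_numerics()  # error out on the first op that yields NaN/Inf

nan_guard = tf.keras.callbacks.TerminateOnNaN()  # abort fit() once the loss is NaN

# Same AI, inputs and outputs as in the code above; batch_size=32 is arbitrary.
AI.model.fit(inputs, outputs, epochs=1, batch_size=32, verbose=1,
    callbacks=[nan_guard])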

Thanks in advance for your help!
<p>I've developed a TensorFlow model for an artificial intelligence project, but I'm having a problem with NaN in the loss function during training. Here's an extract from my code:</p>
<pre><code>import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import data as dt
print("[Info] Data loaded")

import numpy as np
print("[Info] Numpy loaded")
import random
print("[Info] Random loaded")
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
print("[Info] Tensorflow loaded")
import atexit
print("[Info] Atexit loaded")
import threading
print("[Info] Threading loaded")


# Add this line after importing TensorFlow
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# Limit CPU threads
tf.config.threading.set_intra_op_parallelism_threads(0)
tf.config.threading.set_inter_op_parallelism_threads(0)

inputs = dt.train_inputs
outputs = dt.train_outputs
val_inputs = dt.val_inputs
val_outputs = dt.val_outputs
test_inputs = dt.test_inputs
test_outputs = dt.test_outputs

# Check data for NaNs or infinite values
def check_data(data, name):
if np.isnan(data).any() or np.isinf(data).any():
print(f"[Error] {name} contient des NaNs ou des valeurs infinies")
else:
print(f"[Info] {name} est valide")

check_data(inputs, "train_inputs")
check_data(outputs, "train_outputs")
check_data(val_inputs, "val_inputs")
check_data(val_outputs, "val_outputs")
check_data(test_inputs, "test_inputs")
check_data(test_outputs, "test_outputs")

class EThAI:
def __init__(self, save: bool=True, load: bool=dt.values["DoLoad"]):
# Create a lock to synchronise access to the file
self.file_access_lock = threading.Lock()
self.save = bool(save)
self.load = bool(load)

self.build_model()

if self.save:
self.model.save("CryptoAIModel", save_format='tf')

def build_model(self, learning_rate=0.0001):
params = {
'dense_units1': 256.0,
'dense_units2': 64
}
# Model with optimisated hyperparameter
self.model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(33 * 80,)),
tf.keras.layers.Dense(params["dense_units1"], activation='sigmoid'),
tf.keras.layers.Dense(params["dense_units2"], activation='sigmoid'),
tf.keras.layers.Dense(3, activation='softmax')
])

if self.load:
self.model = tf.keras.models.load_model("CryptoAIModel")

# Compilation of the model with the Adam optimizer and the learning rate planner
self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
loss='categorical_crossentropy',
metrics=['accuracy'])


print(f"[Info] Model Build (learning_rate={learning_rate})")

def train(self, epochs: int):
print("[Info] Starting...")

def callback(epoch, _):
self.model.save("CryptoAIModel", save_format='tf')

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lambda_callback = LambdaCallback(
on_epoch_end=callback
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

if dt.values["DoBatch"]:
# Use this list of callbacks when training the model
self.model.fit(inputs, outputs, epochs=epochs, batch_size=dt.values["BatchSize"], verbose=1,
validation_data=(val_inputs, val_outputs),
callbacks=[early_stopping, lambda_callback, tensorboard_callback])
else:
self.model.fit(inputs, outputs, epochs=epochs, batch_size=dt.values["BatchSize"], validation_data=(val_inputs, val_outputs), verbose=1,
callbacks=[early_stopping, lambda_callback, tensorboard_callback])

self.model.save("CryptoAIModel", save_format='tf')
print("[Info] Finished !!!")


def predict(self, input):
prediction = self.model.predict(input)
return prediction

def final_output(self, output):
values = output[0]
rounded_values = []
for value in values:
if value < 0.5:
rounded_values.append(0)
else:
rounded_values.append(1)
return rounded_values

def get_random_input_output(self, batch_size=dt.values["DoBatch"]):
num = random.randint(0, len(inputs) - batch_size)
input_data = np.array([inputs[num+i] for i in range(batch_size)])
output_data = np.array([outputs[num+i] for i in range(batch_size)])
return input_data, output_data


if __name__ == '__main__':
AI = EThAI(save=True)
try:
AI.build_model(0.001)
dt.values["DoLoad"] = True
AI.load = True
print("[Info] Training Soon!!!")
AI.train(20)
AI.build_model(0.0001)
AI.train(20)
AI.build_model(0.00001)
AI.train(20)
AI.build_model(0.000001)
AI.train(20)
AI.build_model(0.0000001)
AI.train(20)
dt.values["DoLoad"] = True
except Exception as e:
print(e)
if AI.save:
AI.model.save("CryptoAIModel", save_format='tf')
</code></pre>
<p>I have already checked my data for NaN or infinite values before training, but the problem persists. How can I solve this NaN problem in the loss function of my TensorFlow model? I am using TensorFlow version 2.13.0 on Python 3.8.10 with GPU.</p>
<p>Thanks in advance for your help!</p>
 

Latest posts

Top