diff --git a/experiment/main.py b/experiment/main.py
index dcefad8f5..8b0ea0eaf 100644
--- a/experiment/main.py
+++ b/experiment/main.py
@@ -1,306 +1,12 @@
-import hashlib
-import os
 import pathlib
-import pickle
 import random
 import subprocess
-import sys
-from argparse import ArgumentParser, BooleanOptionalAction
-from datetime import datetime
-from typing import List
+from argparse import ArgumentParser
 
-import keras_tuner as kt
 import numpy as np
-import pandas as pd
-import tensorflow as tf
-from keras import Model  # type: ignore
-from numpy import ndarray
-from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.utils import compute_class_weight
-from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
-
-from experiment.plot import save_plot
-from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
-from experiment.src.features import prepare_data
-from experiment.src.log_callback import LogCallback
-from experiment.src.ml_model import MlModel
-from experiment.src.model_config_preprocess import model_config_preprocess
-from experiment.src.prepare_data import prepare_train_data
-
-
-def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
-    """Evaluate Keras model with printing scores
-
-    Args:
-        thresholds: dict of credsweeper thresholds
-        keras_model: fitted keras model
-        x_data: List of np.arrays. Number and shape depends on model
-        y_label: expected result
-
-    """
-    predictions_proba = keras_model.predict(x_data, verbose=2).ravel()
-    for name, threshold in thresholds.items():
-        predictions = (predictions_proba > threshold)
-        accuracy = accuracy_score(y_label, predictions)
-        precision = precision_score(y_label, predictions)
-        recall = recall_score(y_label, predictions)
-        loss = log_loss(y_label, predictions)
-        f1 = f1_score(y_label, predictions)
-        print(f"{name}: {threshold:0.6f}, "
-              f"accuracy: {accuracy:0.6f}, "
-              f"precision:{precision:0.6f}, "
-              f"recall: {recall:0.6f}, "
-              f"loss: {loss:0.6f}, "
-              f"F1:{f1:0.6f}")
-
-
-def main(
-        cred_data_location: str,
-        jobs: int,
-        epochs: int,
-        batch_size: int,
-        patience: int,
-        doc_target: bool,
-        use_tuner: bool,
-        eval_test: bool,
-        eval_train: bool,
-        eval_full: bool,
-) -> str:
-    print(f"Memory at start: {LogCallback.get_memory_info()}")
-
-    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    dir_path = pathlib.Path("results")
-    os.makedirs(dir_path, exist_ok=True)
-
-    print(f"Train model on data from {cred_data_location}")
-    meta_checksum, data_checksum = prepare_train_data(cred_data_location, jobs, doc_target)
-
-    df_all_file = dir_path / f"{meta_checksum}-{data_checksum}.pkl"
-    if df_all_file.exists():
-        df_all = pd.read_pickle(df_all_file)
-        print(f"Read from {df_all_file}")
-    else:
-        # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-        detected_data = read_detected_data(f"results/detected_data.{data_checksum}.json")
-        print(f"CredSweeper detected {len(detected_data)} credentials without ML")
-        # all markup data
-        meta_data = read_metadata(f"{cred_data_location}/meta")
-        print(f"Metadata markup: {len(meta_data)} items")
-        df_all = join_label(detected_data, meta_data, cred_data_location)
-        # np.save(df_all_file, df_all)
-        df_all.to_pickle(df_all_file)
-        print(f"Stored to {df_all_file}")
-        # to prevent extra memory consumption - delete unnecessary objects
-        del detected_data
-        del meta_data
-
-    # workaround for CI step
-    for i in range(3):
-        # there are 2 times possible fails due ml config was updated
-        try:
-            thresholds = model_config_preprocess(df_all, doc_target)
-            break
-        except RuntimeError as exc:
-            if "RESTART:" in str(exc):
-                continue
-            else:
-                raise
-    else:
-        raise RuntimeError("Something went wrong")
-
-    print(f"Common dataset: {len(df_all)} items")
-    df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
-    print(f"Common dataset: {len(df_all)} items after drop duplicates")
-
-    # random split
-    lucky_number = random.randint(1, 1 << 32)
-    print(f"Lucky number: {lucky_number}")
-    df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=lucky_number)
-    len_df_train = len(df_train)
-    print(f"Train size: {len_df_train}")
-    len_df_test = len(df_test)
-    print(f"Test size: {len_df_test}")
-
-    print(f"Prepare full data")
-    x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
-    y_full: ndarray = get_y_labels(df_all)
-    del df_all
-
-    print(f"Prepare train data")
-    x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
-    print("x_train_value dtype ", x_train_value.dtype)  # dbg
-    print("x_train_features dtype", x_train_features.dtype)  # dbg
-    y_train = get_y_labels(df_train)
-    print("y_train dtype", y_train.dtype)  # dbg
-    del df_train
-
-    print(f"Class-1 prop on train: {np.mean(y_train):.4f}")
-
-    classes = np.unique(y_train)
-    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
-    max_weight = max(class_weights)
-    class_weights = [weight / max_weight for weight in class_weights]
-    print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}")
-    class_weight = dict(zip(classes, class_weights))
-    print(f"class_weight: {class_weight}")  # information about class weights
-
-    print(f"Prepare test data")
-    x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
-    y_test = get_y_labels(df_test)
-    print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
-    del df_test
-
-    print(f"Memory before search / compile: {LogCallback.get_memory_info()}")
-
-    hp_dict = {
-        "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
-        "line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
-        "variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.46),
-        "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
-        "dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.18),
-    }
-    log_callback = LogCallback()
-    if use_tuner:
-        print(f"Tuner initial dict:{hp_dict}")
-        tuner_kwargs = {k: v[0] for k, v in hp_dict.items()}
-        print(f"Tuner kwargs:{tuner_kwargs}")
-
-        tuner = kt.BayesianOptimization(
-            hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
-                               **tuner_kwargs),
-            objective='val_loss',
-            directory=str(dir_path / f"{current_time}.tuner"),
-            project_name='ml_tuning',
-        )
-        search_early_stopping = EarlyStopping(monitor="val_loss",
-                                              patience=patience,
-                                              mode="min",
-                                              restore_best_weights=True,
-                                              verbose=1)
-        tuner.search(
-            x=[x_train_line, x_train_variable, x_train_value, x_train_features],
-            y=y_train,
-            epochs=epochs,
-            batch_size=batch_size,
-            callbacks=[search_early_stopping, log_callback],
-            validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
-            verbose=2,
-        )
-        print("Best Hyperparameters:")
-        for k, v in tuner.get_best_hyperparameters()[0].values.items():
-            print(f"{k}: {v}")
-        param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in hp_dict}
-        del tuner
-    else:
-        print(f"Model is trained with params from dict:{hp_dict}")
-        param_kwargs = {k: v[1] for k, v in hp_dict.items()}
-
-    print(f"Model hyper parameters: {param_kwargs}")
-
-    # repeat train step to obtain actual history chart
-    keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
-                          **param_kwargs).build()
-    if not eval_full:
-        # the data are not necessary
-        del x_full_line
-        del x_full_variable
-        del x_full_value
-        del x_full_features
-        del y_full
-
-    early_stopping = EarlyStopping(monitor="val_loss",
-                                   patience=patience,
-                                   mode="min",
-                                   restore_best_weights=True,
-                                   verbose=1)
-    model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"),
-                                       monitor="val_loss",
-                                       save_best_only=True,
-                                       mode="min",
-                                       verbose=1)
-
-    print(f"Memory before train: {LogCallback.get_memory_info()}")
-
-    fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
-                                  y=y_train,
-                                  batch_size=batch_size,
-                                  epochs=epochs,
-                                  verbose=2,
-                                  validation_data=([x_test_line, x_test_variable, x_test_value,
-                                                    x_test_features], y_test),
-                                  class_weight=class_weight,
-                                  callbacks=[early_stopping, model_checkpoint, log_callback],
-                                  use_multiprocessing=True)
-
-    # if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
-    #     print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}")
-
-    print(f"Memory after train: {LogCallback.get_memory_info()}")
-
-    with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
-        pickle.dump(fit_history, f)
-
-    model_file_name = dir_path / f"ml_model_at-{current_time}"
-    keras_model.save(model_file_name, include_optimizer=False)
-
-    if eval_test:
-        print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}")
-        evaluate_model(thresholds, keras_model, [x_test_line, x_test_variable, x_test_value, x_test_features], y_test)
-        # drop small test set first to free a bit more memory for next evaluation
-        del x_test_line
-        del x_test_variable
-        del x_test_value
-        del x_test_features
-        del y_test
-
-    if eval_train:
-        print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}")
-        evaluate_model(thresholds, keras_model, [x_train_line, x_train_variable, x_train_value, x_train_features],
-                       y_train)
-        del x_train_line
-        del x_train_variable
-        del x_train_value
-        del x_train_features
-        del y_train
-
-    if eval_full:
-        print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}")
-        evaluate_model(thresholds, keras_model, [x_full_line, x_full_variable, x_full_value, x_full_features], y_full)
-        del x_full_line
-        del x_full_variable
-        del x_full_value
-        del x_full_features
-        del y_full
-
-    onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
-    # convert the model to onnx right now
-    convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
-                   f" --output {str(onnx_model_file)} --verbose"
-    subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
-    with open(onnx_model_file, "rb") as f:
-        onnx_md5 = hashlib.md5(f.read()).hexdigest()
-        print(f"ml_model.onnx:{onnx_md5}")
-
-    with open(pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_config.json", "rb") as f:
-        config_md5 = hashlib.md5(f.read()).hexdigest()
-        print(f"ml_config.json:{config_md5}")
-
-    best_epoch = 1 + np.argmin(np.array(fit_history.history['val_loss']))
-
-    # ml history analysis
-    save_plot(
-        stamp=current_time,
-        title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
-        history=fit_history,
-        dir_path=dir_path,
-        best_epoch=int(best_epoch),
-        info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
-    )
-
-    return str(model_file_name.absolute())
+import torch
+from train import train
 
 
 if __name__ == "__main__":
     parser = ArgumentParser()
@@ -329,59 +35,46 @@ def main(
                         default=256,
                         dest="batch_size",
                         metavar="POSITIVE_INT")
+    parser.add_argument("--device",
+                        help="The device(CPU or GPU) that will be used to train the model",
+                        default="cpu",
+                        type=str,
+                        choices=["cpu", "cuda"],
+                        dest="device")
     parser.add_argument("-p",
                         "--patience",
                         help="early stopping patience (default: 5)",
                         default=5,
                         dest="patience",
                         metavar="POSITIVE_INT")
-    parser.add_argument("--doc", help="use doc target", dest="doc_target", action=BooleanOptionalAction, default=False)
-    parser.add_argument("--tuner",
-                        help="use keras tuner",
-                        dest="use_tuner",
-                        action=BooleanOptionalAction,
-                        default=False)
-    parser.add_argument("--eval-test",
-                        help="evaluate model for test dataset",
-                        dest="eval_test",
-                        action=BooleanOptionalAction,
-                        default=False)
-    parser.add_argument("--eval-train",
-                        help="evaluate model for train dataset",
-                        dest="eval_train",
-                        action=BooleanOptionalAction,
-                        default=False)
+    parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
+    parser.add_argument("--tuner", help="use parameter tuner", dest="use_tuner", action="store_true")
    parser.add_argument("--eval-full",
                         help="evaluate model for full dataset after train",
                         dest="eval_full",
-                        action=BooleanOptionalAction,
-                        default=False)
+                        action="store_true")
 
     args = parser.parse_args()
 
-    fixed_seed = 20250721
+    fixed_seed = 20250124
     print(f"Fixed seed:{fixed_seed}")
-    tf.random.set_seed(fixed_seed)
     np.random.seed(fixed_seed)
     random.seed(fixed_seed)
+    torch.manual_seed(fixed_seed)
 
-    # to keep the hash in log and verify
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
 
-    print(args)  # dbg
-    _model_file_name = main(
+    _model_file_name = train(
         cred_data_location=args.cred_data_location,
         jobs=int(args.jobs),
         epochs=int(args.epochs),
+        device=str(args.device),
         batch_size=int(args.batch_size),
         patience=int(args.patience),
         doc_target=bool(args.doc_target),
         use_tuner=bool(args.use_tuner),
-        eval_test=bool(args.eval_test),
-        eval_train=bool(args.eval_train),
         eval_full=bool(args.eval_full),
     )
-    # print in last line the name
     print(f"\nYou can find your model in:\n{_model_file_name}")
diff --git a/experiment/plot.py b/experiment/plot.py
index d4d622ee8..995e4d7ee 100644
--- a/experiment/plot.py
+++ b/experiment/plot.py
@@ -1,32 +1,43 @@
 import pathlib
 import pickle
+import math
 
 import matplotlib.pyplot as plt
-from keras.src.callbacks import History
 from matplotlib import image as mpimg
 
+METRICS = ["loss", "accuracy", "precision", "recall"]
+GRAPHS_PER_ROW = 2
 
-def save_plot(stamp: str, title: str, history: History, dir_path: pathlib.Path, best_epoch: int, info: str):
-    plt.clf()
-    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 9), tight_layout=True)
+
+def save_plot(stamp: str, title: str, history: dict, dir_path: pathlib.Path, best_epoch: int, info: str):
+    plt.clf()
+    nrows = math.ceil(len(METRICS) / GRAPHS_PER_ROW)
+    ncols = GRAPHS_PER_ROW
+    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 9), tight_layout=True)
     fig.suptitle(f"{stamp} {title}")
-    # train displays "Epoch 1/7", so let the plot starts from 1
-    x = [x + 1 for x in history.epoch]
-
-    for idx, characteristic in enumerate(["loss", "binary_accuracy", "precision", "recall"]):
-        axes_x = (1 & idx)
-        axes_y = (2 & idx) >> 1
-        y_train = history.history[characteristic]
-        y_test = history.history[f"val_{characteristic}"]
-        axes[axes_x, axes_y].plot(x, y_train, label="train")
-        axes[axes_x, axes_y].plot(x, y_test, label="test")
-        axes[axes_x, axes_y].set_title(characteristic)
-        axes[axes_x, axes_y].legend(loc="upper left")
-        axes[axes_x, axes_y].grid(visible=True, which="both", color="grey", linewidth=0.75, linestyle="dotted")
-        axes[axes_x, axes_y].set_xticks(range(min(x), max(x) + 1, 1), minor=True)
-        axes[axes_x, axes_y].axvline(x=best_epoch, color='green', linestyle='--', linewidth=1)
+    x = list(range(1, len(history['loss']) + 1))
+
+    for idx, characteristic in enumerate(METRICS):
+        axes_x = idx % GRAPHS_PER_ROW
+        axes_y = idx // GRAPHS_PER_ROW
+        ax = axes[axes_y, axes_x] if nrows > 1 else axes[axes_x]
+        y_train = history[characteristic]
+        y_test = history[f"val_{characteristic}"]
+        ax.plot(x, y_train, label="train")
+        ax.plot(x, y_test, label="test")
+        ax.set_title(characteristic)
+        ax.legend(loc="upper left")
+        ax.grid(visible=True, which="both", color="grey", linewidth=0.75, linestyle="dotted")
+        ax.set_xticks(range(min(x), max(x) + 1, 1), minor=True)
+        ax.axvline(x=best_epoch, color='green', linestyle='--', linewidth=1)
+
+    if nrows * ncols > len(METRICS):
+        for j in range(len(METRICS), nrows * ncols):
+            axes_x = j % GRAPHS_PER_ROW
+            axes_y = j // GRAPHS_PER_ROW
+            ax = axes[axes_y, axes_x] if nrows > 1 else axes[axes_x]
+            ax.axis('off')
 
     fig.text(0.001, 0.001, info, fontsize=10, color='green', backgroundcolor='white')
     plt.savefig(dir_path / f"{stamp}.png", dpi=96)
diff --git a/experiment/requirements.txt b/experiment/requirements.txt
index bdd2a5031..d63fc6123 100644
--- a/experiment/requirements.txt
+++ b/experiment/requirements.txt
@@ -2,20 +2,12 @@
 # pip 24.3.1
 
 # version sensetive
-h5py==3.12.1
-keras==2.15.0
-keras-tuner==1.4.7
-numpy==1.26.4
-onnx==1.17.0
-protobuf==3.20.3
-scikit-learn==1.6.1
-tensorflow==2.15.1
-tensorrt==10.8.0.43
-tf2onnx==1.16.1
-wrapt==1.14.1
+onnx==1.19.0
+scikit-learn==1.7.0
+torch==2.8.0
+optuna==4.2.1
 
 # version insensetive
-types-tensorflow
 matplotlib
 colorama
 psutil
diff --git a/experiment/src/ml_model.py b/experiment/src/ml_model.py
index fe224800a..aa397c990 100644
--- a/experiment/src/ml_model.py
+++ b/experiment/src/ml_model.py
@@ -1,84 +1,82 @@
 from typing import Any
 
-import keras_tuner as kt
-from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate, Dropout
-from tensorflow.keras.models import Model
-from tensorflow.keras.optimizers import Adam
-from tensorflow.python.keras.metrics import BinaryAccuracy, Precision, Recall
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 from credsweeper.common.constants import ML_HUNK
 from credsweeper.ml_model.ml_validator import MlValidator
 
 
-class MlModel(kt.HyperModel):
-    d_type = "float32"
+class MlModel(nn.Module):
 
-    def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, **kwargs):
-        self.line_shape = line_shape
-        self.variable_shape = variable_shape
-        self.value_shape = value_shape
-        self.feature_shape = feature_shape
-        self.__kwargs = kwargs
-
-    def __get_hyperparam(self, param_name: str, hp=None) -> Any:
-        if param := self.__kwargs.get(param_name):
-            if isinstance(param, float):
-                print(f"'{param_name}' constant = {param}")
-                return param
-            elif hp and isinstance(param, tuple) and 3 == len(param):
-                print(f"'{param_name}' tuning = {param}")
-                return hp.Float(param_name, min_value=param[0], max_value=param[1], step=param[2])
-            else:
-                raise ValueError(f"'{param_name}' was not inited well {param} tuner is {bool(hp)}")
-        else:
-            raise ValueError(f"'{param_name}' was not defined during init and tuner is used")
-
-    def build(self, hp=None) -> Model:
-        """Get keras model with string and feature input and single binary out"""
+    def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, hp=None):
+        super(MlModel, self).__init__()
+        if hp is None:
+            hp = {}
         value_lstm_dropout_rate = self.__get_hyperparam("value_lstm_dropout_rate", hp)
         line_lstm_dropout_rate = self.__get_hyperparam("line_lstm_dropout_rate", hp)
         variable_lstm_dropout_rate = self.__get_hyperparam("variable_lstm_dropout_rate", hp)
         dense_a_dropout_rate = self.__get_hyperparam("dense_a_lstm_dropout_rate", hp)
         dense_b_dropout_rate = self.__get_hyperparam("dense_b_lstm_dropout_rate", hp)
 
-        line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
-        line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
-        line_bidirectional = Bidirectional(layer=line_lstm, name="line_bidirectional")
-        line_lstm_branch = Dropout(line_lstm_dropout_rate, name="line_dropout")(line_bidirectional(line_input))
-
-        variable_input = Input(shape=(None, self.variable_shape[2]), name="variable_input", dtype=self.d_type)
-        variable_lstm = LSTM(units=self.variable_shape[1], dtype=self.d_type)
-        variable_bidirectional = Bidirectional(layer=variable_lstm, name="variable_bidirectional")
-        variable_lstm_branch = Dropout(variable_lstm_dropout_rate,
-                                       name="variable_dropout")(variable_bidirectional(variable_input))
-
-        value_input = Input(shape=(None, self.value_shape[2]), name="value_input", dtype=self.d_type)
-        value_lstm = LSTM(units=self.value_shape[1], dtype=self.d_type)
-        value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
-        value_lstm_branch = Dropout(value_lstm_dropout_rate, name="value_dropout")(value_bidirectional(value_input))
-
-        feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)
-
-        joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, feature_input])
+        self.line_lstm = nn.LSTM(input_size=line_shape[2],
+                                 hidden_size=line_shape[1],
+                                 batch_first=True,
+                                 bidirectional=True)
+        self.variable_lstm = nn.LSTM(input_size=variable_shape[2],
+                                     hidden_size=variable_shape[1],
+                                     batch_first=True,
+                                     bidirectional=True)
+        self.value_lstm = nn.LSTM(input_size=value_shape[2],
+                                  hidden_size=value_shape[1],
+                                  batch_first=True,
+                                  bidirectional=True)
+
+        self.line_dropout = nn.Dropout(line_lstm_dropout_rate)
+        self.variable_dropout = nn.Dropout(variable_lstm_dropout_rate)
+        self.value_dropout = nn.Dropout(value_lstm_dropout_rate)
+
+        dense_units = 2 * MlValidator.MAX_LEN + 2 * 2 * ML_HUNK + feature_shape[1]
+
+        self.dense_a = nn.Linear(dense_units, dense_units)
+        self.dense_b = nn.Linear(dense_units, dense_units)
+        self.dense_final = nn.Linear(dense_units, 1)
+
+        self.a_dropout = nn.Dropout(dense_a_dropout_rate)
+        self.b_dropout = nn.Dropout(dense_b_dropout_rate)
+
+    @staticmethod
+    def __get_hyperparam(param_name: str, hyperparameters=None) -> Any:
+        if hyperparameters is None:
+            hyperparameters = {}
+        if param_name in hyperparameters:
+            param = hyperparameters[param_name]
+            if isinstance(param, float):
+                return param
+            else:
+                raise ValueError(f"Unexpected '{param_name}': {param}")
+        else:
+            raise ValueError(f"'{param_name}' was not defined during initialization of the model.")
 
-        # 3 bidirectional + features
-        dense_units = 2 * MlValidator.MAX_LEN + 2 * 2 * ML_HUNK + self.feature_shape[1]
-        # check after model compilation. Should be matched the combined size.
+    def forward(self, line_input: torch.Tensor, variable_input: torch.Tensor, value_input: torch.Tensor,
+                feature_input: torch.Tensor):
+        line_out, _ = self.line_lstm(line_input)
+        line_out = self.line_dropout(line_out[:, -1, :])
 
-        # first hidden layer
-        dense_a = Dense(units=dense_units, activation='relu', name="a_dense", dtype=self.d_type)(joined_features)
-        dropout_dense_a = Dropout(dense_a_dropout_rate, name="a_dropout")(dense_a)
+        variable_out, _ = self.variable_lstm(variable_input)
+        variable_out = self.variable_dropout(variable_out[:, -1, :])
 
-        # second hidden layer
-        dense_b = Dense(units=dense_units, activation='relu', name="b_dense", dtype=self.d_type)(dropout_dense_a)
-        dropout_dense_b = Dropout(dense_b_dropout_rate, name="b_dropout")(dense_b)
+        value_out, _ = self.value_lstm(value_input)
+        value_out = self.value_dropout(value_out[:, -1, :])
 
-        dense_final = Dense(units=1, activation='sigmoid', name="prediction", dtype=self.d_type)(dropout_dense_b)
+        joined_features = torch.cat((line_out, variable_out, value_out, feature_input), dim=1)
 
-        metrics = [BinaryAccuracy(name="binary_accuracy"), Precision(name="precision"), Recall(name="recall")]
+        dense_a = F.relu(self.dense_a(joined_features))
+        dense_a = self.a_dropout(dense_a)
 
-        model: Model = Model(inputs=[line_input, variable_input, value_input, feature_input], outputs=dense_final)
-        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=metrics)
-        model.summary(line_length=120, expand_nested=True, show_trainable=True)
+        dense_b = F.relu(self.dense_b(dense_a))
+        dense_b = self.b_dropout(dense_b)
 
-        return model
+        output = torch.sigmoid(self.dense_final(dense_b))
+        return output
diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py
index c51c50e3b..5f28c2111 100644
--- a/experiment/src/prepare_data.py
+++ b/experiment/src/prepare_data.py
@@ -20,7 +20,7 @@ def execute_scanner(dataset_location: str, result_location_str: str, jobs: int,
         sys.exit(error_code)
 
 
-def dir_checksum(dir_path: Path) -> str:
+def data_checksum(dir_path: Path) -> str:
     checksum = hashlib.md5(b'').digest()
     for root, dirs, files in os.walk(dir_path):
         for file in files:
@@ -39,18 +39,17 @@ def prepare_train_data(cred_data_location: str, jobs: int, doc_target: bool):
         new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
         Util.yaml_dump(new_rules, "results/train_config.yaml")
 
-    meta_checksum = dir_checksum(Path(cred_data_location) / "meta")
+    meta_checksum = data_checksum(Path(cred_data_location) / "meta")
     print(f"meta checksum {meta_checksum}")
 
-    data_checksum = dir_checksum(Path(cred_data_location) / "data")
-    print(f"data checksum {data_checksum}")
-    detected_data_filename = f"results/detected_data.{data_checksum}.json"
+    data_dir_checksum = data_checksum(Path(cred_data_location) / "data")
+    print(f"data checksum {data_dir_checksum}")
+    detected_data_filename = f"results/detected_data.{data_dir_checksum}.json"
 
     if not os.path.exists(detected_data_filename):
         print(f"Get CredSweeper results from {cred_data_location}. May take some time")
         execute_scanner(cred_data_location, detected_data_filename, jobs, doc_target)
     else:
-        print(f"Get cached result {data_checksum}")
+        print(f"Get cached result {data_dir_checksum}")
 
     print("Train data prepared!")
-    return meta_checksum, data_checksum
diff --git a/experiment/train.py b/experiment/train.py
new file mode 100644
index 000000000..e6aa3764b
--- /dev/null
+++ b/experiment/train.py
@@ -0,0 +1,429 @@
+import hashlib
+import os
+import pathlib
+import pickle
+import random
+from datetime import datetime
+from typing import List, Dict
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.utils import compute_class_weight
+import optuna
+from optuna.samplers import GridSampler
+from optuna.pruners import NopPruner
+
+from experiment.plot import save_plot
+from experiment.src.ml_model import MlModel
+from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
+from experiment.src.features import prepare_data
+from experiment.src.model_config_preprocess import model_config_preprocess
+from experiment.src.prepare_data import prepare_train_data, data_checksum
+
+GPU_SAMPLE_LIMIT = 1024
+DEFAULT_LEARNING_RATE = 0.0005
+
+
+def objective(trial, train_loader: DataLoader, test_loader: DataLoader, model_inputs_size: List[tuple],
+              hp: Dict[str, tuple]):
+    best_val_loss = trial.study.user_attrs["best_val_loss"]
+    epochs = trial.study.user_attrs["epochs"]
+    device = trial.study.user_attrs["device"]
+    best_model_path = trial.study.user_attrs["best_model_path"]
+    params = {}
+    for param_name, ((low, high, step), _default) in hp.items():
+        params[param_name] = trial.suggest_float(param_name, low, high, step=step)
+
+    model = MlModel(*model_inputs_size, params).to(device)
+    optimizer = optim.Adam(model.parameters(), lr=DEFAULT_LEARNING_RATE)
+    criterion = nn.BCELoss()
+
+    best_loss = float('inf')
+    patience_counter = 0
+
+    if device.type == "cuda" and GPU_SAMPLE_LIMIT < train_loader.batch_size:
+        accumulation_steps = (train_loader.batch_size + GPU_SAMPLE_LIMIT - 1) // GPU_SAMPLE_LIMIT
+    else:
+        accumulation_steps = 1
+
+    for epoch in range(epochs):
+        model.train()
+        for batch in train_loader:
+            x_tensors = [x.to(device) for x in batch[:-1]]
+            y_batch = batch[-1].to(device)
+            batch_size = y_batch.shape[0]
+            sub_batch_size = max(1, batch_size // accumulation_steps)
+            optimizer.zero_grad()
+            for i in range(accumulation_steps):
+                start = i * sub_batch_size
+                end = (i + 1) * sub_batch_size if i < accumulation_steps - 1 else batch_size
+                inputs_sub = [tens[start:end] for tens in x_tensors]
+                labels_sub = y_batch[start:end]
+                outputs = model(*inputs_sub).squeeze(-1)
+                loss = criterion(outputs, labels_sub) / accumulation_steps
+                loss.backward()
+            optimizer.step()
+
+        model.eval()
+        val_loss = 0.0
+        with torch.no_grad():
+            for batch in test_loader:
+                x_tensors = [x.to(device) for x in batch[:-1]]
+                y_batch = batch[-1].to(device)
+                batch_size = y_batch.shape[0]
+                sub_batch_size = max(1, batch_size // accumulation_steps)
+                for i in range(accumulation_steps):
+                    start = i * sub_batch_size
+                    end = (i + 1) * sub_batch_size if i < accumulation_steps - 1 else batch_size
+                    inputs_sub = [tens[start:end] for tens in x_tensors]
+                    labels_sub = y_batch[start:end]
+                    outputs = model(*inputs_sub).squeeze(-1)
+                    loss = criterion(outputs, labels_sub) / accumulation_steps
+                    val_loss += loss.item()
+        val_loss /= len(test_loader)
+        trial.report(val_loss, epoch)
+        if val_loss < best_loss:
+            best_loss = val_loss
+            patience_counter = 0
+            if val_loss < best_val_loss:
+                best_val_loss = val_loss
+                trial.study.set_user_attr("best_val_loss", best_val_loss)
+                torch.save(model.state_dict(), best_model_path)
+        else:
+            patience_counter += 1
+            if patience_counter >= 5:
+                break
+        if trial.should_prune():
+            raise optuna.TrialPruned()
+
+    return best_loss
+
+
+def evaluate_model(thresholds: dict,
+                   model: torch.nn.Module,
+                   x_data: List[np.ndarray],
+                   y_label: np.ndarray,
+                   device,
+                   batch_size=256):
+    model.eval()
+    predictions_proba = []
+    dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_data])
+    data_loader = DataLoader(dataset, batch_size=batch_size)
+    with torch.no_grad():
+        for batch in data_loader:
+            x_tensors = [x.to(device) for x in batch]
+            batch_preds = model(*x_tensors).cpu().numpy().ravel()
+            predictions_proba.extend(batch_preds)
+    predictions_proba = np.array(predictions_proba)
+    for name, threshold in thresholds.items():
+        predictions = (predictions_proba > threshold)
+        accuracy = accuracy_score(y_label, predictions)
+        precision = precision_score(y_label, predictions)
+        recall = recall_score(y_label, predictions)
+        loss = log_loss(y_label, predictions)
+        f1 = f1_score(y_label, predictions)
+        print(f"{name}: {threshold:0.6f}, "
+              f"accuracy: {accuracy:0.6f}, "
+              f"precision:{precision:0.6f}, "
+              f"recall: {recall:0.6f}, "
+              f"loss: {loss:0.6f}, "
+              f"F1:{f1:0.6f}")
+
+
+def train(cred_data_location: str,
+          jobs: int,
+          epochs: int,
+          batch_size: int,
+          device: str,
+          patience: int,
+          doc_target: bool,
+          use_tuner: bool = False,
+          eval_full: bool = False) -> str:
+    if device == "cpu":
+        device = torch.device("cpu")
+    elif device == "cuda" and torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        raise ValueError(f"Device {device} not supported or not available")
+
+    print(f"Use device: {device}")
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    dir_path = pathlib.Path("results")
+    os.makedirs(dir_path, exist_ok=True)
+
+    print(f"Train model on data from {cred_data_location}")
+    prepare_train_data(cred_data_location, jobs, doc_target)
+
+    cred_data_location_path = pathlib.Path(cred_data_location) / "data"
+    detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location_path)}.json")
+    print(f"CredSweeper detected {len(detected_data)} credentials without ML")
+    meta_data = read_metadata(f"{cred_data_location}/meta")
+    print(f"Metadata markup: {len(meta_data)} items")
+
+    df_all = join_label(detected_data, meta_data, cred_data_location)
+    del detected_data
+    del meta_data
+
+    for i in range(3):
+        try:
+            thresholds = model_config_preprocess(df_all, doc_target)
+            break
+        except RuntimeError as exc:
+            if "RESTART:" in str(exc):
+                continue
+            else:
+                raise
+    else:
+        raise RuntimeError("Something went wrong")
+
+    print(f"Common dataset: {len(df_all)} items")
+    df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
+    print(f"Common dataset: {len(df_all)} items after drop duplicates")
+
+    lucky_number = random.randint(1, 1 << 32)
+    print(f"Lucky number: {lucky_number}")
+    df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=lucky_number)
+    len_df_train = len(df_train)
+    print(f"Train size: {len_df_train}")
+    len_df_test = len(df_test)
+    print(f"Test size: {len_df_test}")
+
+    if eval_full:
+        print("Prepare full data")
+        x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
+        y_full = get_y_labels(df_all)
+    del df_all
+
+    print("Prepare train data")
+    x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
+    y_train = get_y_labels(df_train)
+    del df_train
+
+    print(f"Class-1 prop on train: {np.mean(y_train):.4f}")
+    classes = np.unique(y_train)
+    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
+    max_weight = max(class_weights)
+    class_weights = [weight / max_weight for weight in class_weights]
+    print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}")
+    class_weight = dict(zip(classes, class_weights))
+    print(f"class_weight: {class_weight}")
+
+    print("Prepare test data")
+    x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
+    y_test = get_y_labels(df_test)
+    print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
+    del df_test
+
+    hp_dict = {
+        "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
+        "line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.3),
+        "variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.31),
+        "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.45),
+        "dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.3),
+    }
+    history = {
+        "loss": [],
+        "val_loss": [],
+        "accuracy": [],
+        "val_accuracy": [],
+        "precision": [],
+        "val_precision": [],
+        "recall": [],
+        "val_recall": []
+    }
+
+    x_train = [x_train_line, x_train_variable, x_train_value, x_train_features]
+    x_test = [x_test_line, x_test_variable, x_test_value, x_test_features]
+    x_full = [x_full_line, x_full_variable, x_full_value, x_full_features] if eval_full else None
+
+    print("Create pytorch train and test datasets...")
+    train_dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_train],
+                                  torch.tensor(y_train, dtype=torch.float32))
+    test_dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_test],
+                                 torch.tensor(y_test, dtype=torch.float32))
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)
+
+    inputs_size = [x_train_line.shape, x_train_variable.shape, x_train_value.shape, x_train_features.shape]
+
+    if use_tuner:
+        print("Start model train with optimization")
+        search_space = {k: list(np.arange(lo, hi + st, st)) for k, ((lo, hi, st), _d) in hp_dict.items()}
+        study = optuna.create_study(sampler=GridSampler(search_space), pruner=NopPruner(), direction="minimize")
+        study.set_user_attr("best_val_loss", float("inf"))
+        study.set_user_attr("epochs", epochs)
+        study.set_user_attr("device", device)
+        study.set_user_attr("best_model_path", str(dir_path / f"{current_time}.trials.best_model.pth"))
+        study.optimize(lambda trial: objective(trial, train_loader, test_loader, inputs_size, hp_dict), n_trials=10)
+        param_kwargs = study.best_params
+        df_trials = study.trials_dataframe()
+        df_trials.to_csv(dir_path / f"{current_time}_trials_df.csv", sep=';')
+    else:
+        param_kwargs = {k: d for k, (_r, d) in hp_dict.items()}
+
+    print(f"Model will be trained using the following params:{param_kwargs}")
+    ml_model = MlModel(*inputs_size, param_kwargs).to(device)
+    optimizer = optim.Adam(ml_model.parameters(), lr=DEFAULT_LEARNING_RATE)
+    criterion = nn.BCELoss()
+
+    best_loss = float('inf')
+    best_epoch = 1
+    patience_counter = 0
+
+    if device.type == "cuda" and GPU_SAMPLE_LIMIT < batch_size:
+        accumulation_steps = (batch_size + GPU_SAMPLE_LIMIT - 1) // GPU_SAMPLE_LIMIT
+    else:
+        accumulation_steps = 1
+
+    for epoch in range(epochs):
+        ml_model.train()
+        running_loss, correct, total = 0.0, 0, 0
+        all_preds, all_labels = [], []
+        for b_idx, batch in enumerate(train_loader, start=1):
+            x_tensors = [x.to(device) for x in batch[:-1]]
+            y_batch = batch[-1].to(device)
+            optimizer.zero_grad()
+            bs = y_batch.shape[0]
+            sub_bs = max(1, bs // accumulation_steps)
+            preds_collect = []
+            for i in range(accumulation_steps):
+                s = i * sub_bs
+                e = (i + 1) * sub_bs if i < accumulation_steps - 1 else bs
+                inputs_sub = [t[s:e] for t in x_tensors]
+                labels_sub = y_batch[s:e]
+                outputs = ml_model(*inputs_sub).squeeze(-1)
+                preds_collect.append(outputs.detach())
+                loss = criterion(outputs, labels_sub) / accumulation_steps
+                running_loss += loss.item()
+                loss.backward()
+            optimizer.step()
+            batch_outputs = torch.cat(preds_collect, dim=0)
+            correct += (batch_outputs.round() == y_batch).sum().item()
+            total += y_batch.numel()
+            all_preds.extend(batch_outputs.cpu().numpy())
+            all_labels.extend(y_batch.cpu().numpy())
+            iter_acc = (batch_outputs.round() == y_batch).float().mean().item()
+            iter_prec = precision_score(y_batch.cpu().numpy(), (batch_outputs.cpu().numpy() > 0.5), zero_division=0)
+            iter_rec = recall_score(y_batch.cpu().numpy(), (batch_outputs.cpu().numpy() > 0.5), zero_division=0)
+            print(
+                f"iter {epoch+1}.{b_idx} loss:{running_loss/b_idx:.4f} acc:{iter_acc:.4f} prec:{iter_prec:.4f} rec:{iter_rec:.4f}"
+            )
+
+        train_loss = running_loss / len(train_loader)
+        train_acc = correct / total
+        train_prec = precision_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        train_rec = recall_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        history["loss"].append(train_loss)
+        history["accuracy"].append(train_acc)
+        history["precision"].append(train_prec)
+        history["recall"].append(train_rec)
+
+        ml_model.eval()
+        val_loss, correct, total = 0.0, 0, 0
+        all_preds, all_labels = [], []
+        with torch.no_grad():
+            for batch in test_loader:
+                x_tensors = [x.to(device) for x in batch[:-1]]
+                y_batch = batch[-1].to(device)
+                bs = y_batch.shape[0]
+                sub_bs = max(1, bs // accumulation_steps)
+                preds_collect = []
+                for i in range(accumulation_steps):
+                    s = i * sub_bs
+                    e = (i + 1) * sub_bs if i < accumulation_steps - 1 else bs
+                    inputs_sub = [t[s:e] for t in x_tensors]
+                    labels_sub = y_batch[s:e]
+                    outputs = ml_model(*inputs_sub).squeeze(-1)
+                    preds_collect.append(outputs.detach())
+                    loss = criterion(outputs, labels_sub) / accumulation_steps
+                    val_loss += loss.item()
+                val_outputs = torch.cat(preds_collect, dim=0)
+                correct += (val_outputs.round() == y_batch).sum().item()
+                total += y_batch.numel()
+                all_preds.extend(val_outputs.cpu().numpy())
+                all_labels.extend(y_batch.cpu().numpy())
+
+        val_loss /= len(test_loader)
+        val_acc = correct / total
+        val_prec = precision_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        val_rec = recall_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        history["val_loss"].append(val_loss)
+        history["val_accuracy"].append(val_acc)
+        history["val_precision"].append(val_prec)
+        history["val_recall"].append(val_rec)
+
+        print(f"Epoch [{epoch+1}/{epochs}]:")
+        print(f"\tTrain - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Prec: {train_prec:.4f}, Rec: {train_rec:.4f}")
+        print(f"\tValidation - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Prec: {val_prec:.4f}, Rec: {val_rec:.4f}")
+
+        if val_loss < best_loss:
+            best_loss = val_loss
+            best_epoch = epoch + 1
+            torch.save(ml_model.state_dict(), dir_path / f"{current_time}.best_model.pth")
+            patience_counter = 0
+        else:
+            patience_counter += 1
+            if patience_counter >= patience:
+                print("Early stopping triggered")
+                break
+
+    ml_model.load_state_dict(torch.load(dir_path / f"{current_time}.best_model.pth", map_location=device))
+
+    print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}")
+    evaluate_model(thresholds, ml_model, x_train, y_train, device, batch_size)
+
+    print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}")
+    evaluate_model(thresholds, ml_model, x_test, y_test, device, batch_size)
+
+    if eval_full:
+        print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}")
+        evaluate_model(thresholds, ml_model, x_full, y_full, device, batch_size)
+        del x_full, x_full_line, x_full_variable, x_full_value, x_full_features, y_full
+
+    onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
+    batch_idx = {0: "batch_size"}
+    dynamic_axes = {
+        "line_input": batch_idx,
+        "variable_input": batch_idx,
+        "value_input": batch_idx,
+        "feature_input": batch_idx,
+        "output": batch_idx,
+    }
+    x_tensors = tuple(torch.tensor([x[0]], dtype=torch.float32).to(device) for x in x_test)
+    with torch.no_grad():
+        torch.onnx.export(ml_model,
+                          x_tensors,
+                          onnx_model_file,
+                          input_names=list(dynamic_axes.keys())[:4],
+                          output_names=list(dynamic_axes.keys())[4:],
+                          dynamic_axes=dynamic_axes,
+                          opset_version=13)
+    print(f"ONNX model exported to {onnx_model_file}")
+
+    with open(onnx_model_file, "rb") as f:
+        onnx_md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"ml_model.onnx:{onnx_md5}")
+
+    with open(pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_config.json", "rb") as f:
+        config_md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"ml_config.json:{config_md5}")
+
+    with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
+        pickle.dump(history, f)
+
+    save_plot(
+        stamp=current_time,
+        title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
+        history=history,
+        dir_path=dir_path,
+        best_epoch=int(best_epoch),
+        info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
+    )
+
+    return str(onnx_model_file)
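
The training loops in train.py split any batch larger than GPU_SAMPLE_LIMIT into sub-batches, scale each sub-batch loss by the number of accumulation steps, and step the optimizer once per original batch. A minimal sketch of that gradient-accumulation pattern in isolation; train_batch_accumulated is an illustrative helper that is not part of the patch, and model, criterion, and optimizer are assumed to be set up as in train.py:

import torch


def train_batch_accumulated(model, criterion, optimizer, x_tensors, y_batch, accumulation_steps):
    """One optimizer step over a batch processed in accumulation_steps slices."""
    batch_size = y_batch.shape[0]
    sub = max(1, batch_size // accumulation_steps)
    optimizer.zero_grad()
    for i in range(accumulation_steps):
        start = i * sub
        end = (i + 1) * sub if i < accumulation_steps - 1 else batch_size
        # squeeze(-1) keeps the batch dimension even for a single-sample slice
        outputs = model(*[t[start:end] for t in x_tensors]).squeeze(-1)
        # scaling keeps the accumulated gradient close to the full-batch mean loss
        loss = criterion(outputs, y_batch[start:end]) / accumulation_steps
        loss.backward()  # gradients add up across slices
    optimizer.step()  # single parameter update per original batch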
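
Because the export path moves from tf2onnx over a saved Keras model to torch.onnx.export, a parity check between the PyTorch module and the exported graph can catch conversion issues early. A sketch, assuming onnxruntime is available (it is not pinned in experiment/requirements.txt); check_onnx_export is a hypothetical helper, and only the input and output names are taken from the dynamic_axes mapping in train.py:

import numpy as np
import onnxruntime as ort
import torch

INPUT_NAMES = ["line_input", "variable_input", "value_input", "feature_input"]


def check_onnx_export(onnx_path: str, ml_model: torch.nn.Module, x_numpy, atol: float = 1e-5) -> None:
    """Compare PyTorch and onnxruntime outputs on one small batch of numpy inputs."""
    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    feed = {name: np.asarray(arr, dtype=np.float32) for name, arr in zip(INPUT_NAMES, x_numpy)}
    (onnx_out,) = session.run(["output"], feed)
    ml_model = ml_model.cpu().eval()  # run the reference forward pass on CPU
    with torch.no_grad():
        torch_out = ml_model(*(torch.from_numpy(feed[name]) for name in INPUT_NAMES)).numpy()
    if not np.allclose(onnx_out, torch_out, atol=atol):
        raise AssertionError("ONNX and PyTorch outputs diverge")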