diff --git a/experiment/main.py b/experiment/main.py
index dcefad8f5..8b0ea0eaf 100644
--- a/experiment/main.py
+++ b/experiment/main.py
@@ -1,306 +1,12 @@
-import hashlib
-import os
 import pathlib
-import pickle
 import random
 import subprocess
-import sys
-from argparse import ArgumentParser, BooleanOptionalAction
-from datetime import datetime
-from typing import List
+from argparse import ArgumentParser
 
-import keras_tuner as kt
 import numpy as np
-import pandas as pd
-import tensorflow as tf
-from keras import Model  # type: ignore
-from numpy import ndarray
-from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.utils import compute_class_weight
-from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
-
-from experiment.plot import save_plot
-from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
-from experiment.src.features import prepare_data
-from experiment.src.log_callback import LogCallback
-from experiment.src.ml_model import MlModel
-from experiment.src.model_config_preprocess import model_config_preprocess
-from experiment.src.prepare_data import prepare_train_data
-
-
-def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
-    """Evaluate Keras model with printing scores
-
-    Args:
-        thresholds: dict of credsweeper thresholds
-        keras_model: fitted keras model
-        x_data: List of np.arrays. Number and shape depends on model
-        y_label: expected result
-
-    """
-    predictions_proba = keras_model.predict(x_data, verbose=2).ravel()
-    for name, threshold in thresholds.items():
-        predictions = (predictions_proba > threshold)
-        accuracy = accuracy_score(y_label, predictions)
-        precision = precision_score(y_label, predictions)
-        recall = recall_score(y_label, predictions)
-        loss = log_loss(y_label, predictions)
-        f1 = f1_score(y_label, predictions)
-        print(f"{name}: {threshold:0.6f}, "
-              f"accuracy: {accuracy:0.6f}, "
-              f"precision:{precision:0.6f}, "
-              f"recall: {recall:0.6f}, "
-              f"loss: {loss:0.6f}, "
-              f"F1:{f1:0.6f}")
-
-
-def main(
-        cred_data_location: str,
-        jobs: int,
-        epochs: int,
-        batch_size: int,
-        patience: int,
-        doc_target: bool,
-        use_tuner: bool,
-        eval_test: bool,
-        eval_train: bool,
-        eval_full: bool,
-) -> str:
-    print(f"Memory at start: {LogCallback.get_memory_info()}")
-
-    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    dir_path = pathlib.Path("results")
-    os.makedirs(dir_path, exist_ok=True)
-
-    print(f"Train model on data from {cred_data_location}")
-    meta_checksum, data_checksum = prepare_train_data(cred_data_location, jobs, doc_target)
-
-    df_all_file = dir_path / f"{meta_checksum}-{data_checksum}.pkl"
-    if df_all_file.exists():
-        df_all = pd.read_pickle(df_all_file)
-        print(f"Read from {df_all_file}")
-    else:
-        # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-        detected_data = read_detected_data(f"results/detected_data.{data_checksum}.json")
-        print(f"CredSweeper detected {len(detected_data)} credentials without ML")
-        # all markup data
-        meta_data = read_metadata(f"{cred_data_location}/meta")
-        print(f"Metadata markup: {len(meta_data)} items")
-        df_all = join_label(detected_data, meta_data, cred_data_location)
-        # np.save(df_all_file, df_all)
-        df_all.to_pickle(df_all_file)
-        print(f"Stored to {df_all_file}")
-        # to prevent extra memory consumption - delete unnecessary objects
-        del detected_data
-        del meta_data
-
-    # workaround for CI step
-    for i in range(3):
-        # there are 2 times possible fails due ml config was updated
-        try:
-            thresholds = model_config_preprocess(df_all, doc_target)
-            break
-        except RuntimeError as exc:
-            if "RESTART:" in str(exc):
-                continue
-            else:
-                raise
-    else:
-        raise RuntimeError("Something went wrong")
-
-    print(f"Common dataset: {len(df_all)} items")
-    df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
-    print(f"Common dataset: {len(df_all)} items after drop duplicates")
-
-    # random split
-    lucky_number = random.randint(1, 1 << 32)
-    print(f"Lucky number: {lucky_number}")
-    df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=lucky_number)
-    len_df_train = len(df_train)
-    print(f"Train size: {len_df_train}")
-    len_df_test = len(df_test)
-    print(f"Test size: {len_df_test}")
-
-    print(f"Prepare full data")
-    x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
-    y_full: ndarray = get_y_labels(df_all)
-    del df_all
-
-    print(f"Prepare train data")
-    x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
-    print("x_train_value dtype ", x_train_value.dtype)  # dbg
-    print("x_train_features dtype", x_train_features.dtype)  # dbg
-    y_train = get_y_labels(df_train)
-    print("y_train dtype", y_train.dtype)  # dbg
-    del df_train
-
-    print(f"Class-1 prop on train: {np.mean(y_train):.4f}")
-
-    classes = np.unique(y_train)
-    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
-    max_weight = max(class_weights)
-    class_weights = [weight / max_weight for weight in class_weights]
-    print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}")
-    class_weight = dict(zip(classes, class_weights))
-    print(f"class_weight: {class_weight}")  # information about class weights
-
-    print(f"Prepare test data")
-    x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
-    y_test = get_y_labels(df_test)
-    print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
-    del df_test
-
-    print(f"Memory before search / compile: {LogCallback.get_memory_info()}")
-
-    hp_dict = {
-        "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
-        "line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
-        "variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.46),
-        "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
-        "dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.18),
-    }
-    log_callback = LogCallback()
-    if use_tuner:
-        print(f"Tuner initial dict:{hp_dict}")
-        tuner_kwargs = {k: v[0] for k, v in hp_dict.items()}
-        print(f"Tuner kwargs:{tuner_kwargs}")
-
-        tuner = kt.BayesianOptimization(
-            hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
-                               **tuner_kwargs),
-            objective='val_loss',
-            directory=str(dir_path / f"{current_time}.tuner"),
-            project_name='ml_tuning',
-        )
-        search_early_stopping = EarlyStopping(monitor="val_loss",
-                                              patience=patience,
-                                              mode="min",
-                                              restore_best_weights=True,
-                                              verbose=1)
-        tuner.search(
-            x=[x_train_line, x_train_variable, x_train_value, x_train_features],
-            y=y_train,
-            epochs=epochs,
-            batch_size=batch_size,
-            callbacks=[search_early_stopping, log_callback],
-            validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
-            verbose=2,
-        )
-        print("Best Hyperparameters:")
-        for k, v in tuner.get_best_hyperparameters()[0].values.items():
-            print(f"{k}: {v}")
-        param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in hp_dict}
-        del tuner
-    else:
-        print(f"Model is trained with params from dict:{hp_dict}")
-        param_kwargs = {k: v[1] for k, v in hp_dict.items()}
-
-    print(f"Model hyper parameters: {param_kwargs}")
-
-    # repeat train step to obtain actual history chart
-    keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
-                          **param_kwargs).build()
-    if not eval_full:
-        # the data are not necessary
-        del x_full_line
-        del x_full_variable
-        del x_full_value
-        del x_full_features
-        del y_full
-
-    early_stopping = EarlyStopping(monitor="val_loss",
-                                   patience=patience,
-                                   mode="min",
-                                   restore_best_weights=True,
-                                   verbose=1)
-    model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"),
-                                       monitor="val_loss",
-                                       save_best_only=True,
-                                       mode="min",
-                                       verbose=1)
-
-    print(f"Memory before train: {LogCallback.get_memory_info()}")
-
-    fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
-                                  y=y_train,
-                                  batch_size=batch_size,
-                                  epochs=epochs,
-                                  verbose=2,
-                                  validation_data=([x_test_line, x_test_variable, x_test_value,
-                                                    x_test_features], y_test),
-                                  class_weight=class_weight,
-                                  callbacks=[early_stopping, model_checkpoint, log_callback],
-                                  use_multiprocessing=True)
-
-    # if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
-    #     print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}")
-
-    print(f"Memory after train: {LogCallback.get_memory_info()}")
-
-    with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
-        pickle.dump(fit_history, f)
-
-    model_file_name = dir_path / f"ml_model_at-{current_time}"
-    keras_model.save(model_file_name, include_optimizer=False)
-
-    if eval_test:
-        print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}")
-        evaluate_model(thresholds, keras_model, [x_test_line, x_test_variable, x_test_value, x_test_features], y_test)
-        # drop small test set first to free a bit more memory for next evaluation
-        del x_test_line
-        del x_test_variable
-        del x_test_value
-        del x_test_features
-        del y_test
-
-    if eval_train:
-        print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}")
-        evaluate_model(thresholds, keras_model, [x_train_line, x_train_variable, x_train_value, x_train_features],
-                       y_train)
-        del x_train_line
-        del x_train_variable
-        del x_train_value
-        del x_train_features
-        del y_train
-
-    if eval_full:
-        print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}")
-        evaluate_model(thresholds, keras_model, [x_full_line, x_full_variable, x_full_value, x_full_features], y_full)
-        del x_full_line
-        del x_full_variable
-        del x_full_value
-        del x_full_features
-        del y_full
-
-    onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
-    # convert the model to onnx right now
-    convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
-                   f" --output {str(onnx_model_file)} --verbose"
-    subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
-    with open(onnx_model_file, "rb") as f:
-        onnx_md5 = hashlib.md5(f.read()).hexdigest()
-        print(f"ml_model.onnx:{onnx_md5}")
-
-    with open(pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_config.json", "rb") as f:
-        config_md5 = hashlib.md5(f.read()).hexdigest()
-        print(f"ml_config.json:{config_md5}")
-
-    best_epoch = 1 + np.argmin(np.array(fit_history.history['val_loss']))
-
-    # ml history analysis
-    save_plot(
-        stamp=current_time,
-        title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
-        history=fit_history,
-        dir_path=dir_path,
-        best_epoch=int(best_epoch),
-        info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
-    )
-
-    return str(model_file_name.absolute())
+import torch
+from train import train
 
 
 if __name__ == "__main__":
     parser = ArgumentParser()
@@ -329,59 +35,46 @@ def main(
                         default=256,
                         dest="batch_size",
                         metavar="POSITIVE_INT")
+    parser.add_argument("--device",
+                        help="The device(CPU or GPU) that will be used to train the model",
+                        default="cpu",
+                        type=str,
+                        choices=["cpu", "cuda"],
+                        dest="device")
     parser.add_argument("-p",
                         "--patience",
                         help="early stopping patience (default: 5)",
                         default=5,
                         dest="patience",
                         metavar="POSITIVE_INT")
-    parser.add_argument("--doc", help="use doc target", dest="doc_target", action=BooleanOptionalAction, default=False)
-    parser.add_argument("--tuner",
-                        help="use keras tuner",
-                        dest="use_tuner",
-                        action=BooleanOptionalAction,
-                        default=False)
-    parser.add_argument("--eval-test",
-                        help="evaluate model for test dataset",
-                        dest="eval_test",
-                        action=BooleanOptionalAction,
-                        default=False)
-    parser.add_argument("--eval-train",
-                        help="evaluate model for train dataset",
-                        dest="eval_train",
-                        action=BooleanOptionalAction,
-                        default=False)
+    parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
+    parser.add_argument("--tuner", help="use parameter tuner", dest="use_tuner", action="store_true")
    parser.add_argument("--eval-full",
                         help="evaluate model for full dataset after train",
                         dest="eval_full",
-                        action=BooleanOptionalAction,
-                        default=False)
+                        action="store_true")
 
     args = parser.parse_args()
 
-    fixed_seed = 20250721
+    fixed_seed = 20250124
     print(f"Fixed seed:{fixed_seed}")
-    tf.random.set_seed(fixed_seed)
     np.random.seed(fixed_seed)
     random.seed(fixed_seed)
+    torch.manual_seed(fixed_seed)
 
-    # to keep the hash in log and verify
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
 
-    print(args)  # dbg
-    _model_file_name = main(
+    _model_file_name = train(
         cred_data_location=args.cred_data_location,
         jobs=int(args.jobs),
         epochs=int(args.epochs),
+        device=str(args.device),
         batch_size=int(args.batch_size),
         patience=int(args.patience),
         doc_target=bool(args.doc_target),
         use_tuner=bool(args.use_tuner),
-        eval_test=bool(args.eval_test),
-        eval_train=bool(args.eval_train),
         eval_full=bool(args.eval_full),
     )
-    # print in last line the name
     print(f"\nYou can find your model in:\n{_model_file_name}")
diff --git a/experiment/plot.py b/experiment/plot.py
index d4d622ee8..995e4d7ee 100644
--- a/experiment/plot.py
+++ b/experiment/plot.py
@@ -1,32 +1,43 @@
 import pathlib
 import pickle
+import math
 
 import matplotlib.pyplot as plt
-from keras.src.callbacks import History
 from matplotlib import image as mpimg
 
+METRICS = ["loss", "accuracy", "precision", "recall"]
+GRAPHS_PER_ROW = 2
 
-def save_plot(stamp: str, title: str, history: History, dir_path: pathlib.Path, best_epoch: int, info: str):
-    plt.clf()
-    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 9), tight_layout=True)
+
+def save_plot(stamp: str, title: str, history: dict, dir_path: pathlib.Path, best_epoch: int, info: str):
+    plt.clf()
+    nrows = math.ceil(len(METRICS) / GRAPHS_PER_ROW)
+    ncols = GRAPHS_PER_ROW
+    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 9), tight_layout=True)
     fig.suptitle(f"{stamp} {title}")
-    # train displays "Epoch 1/7", so let the plot starts from 1
-    x = [x + 1 for x in history.epoch]
-
-    for idx, characteristic in enumerate(["loss", "binary_accuracy", "precision", "recall"]):
-        axes_x = (1 & idx)
-        axes_y = (2 & idx) >> 1
-        y_train = history.history[characteristic]
-        y_test = history.history[f"val_{characteristic}"]
-        axes[axes_x, axes_y].plot(x, y_train, label="train")
-        axes[axes_x, axes_y].plot(x, y_test, label="test")
-        axes[axes_x, axes_y].set_title(characteristic)
-        axes[axes_x, axes_y].legend(loc="upper left")
-        axes[axes_x, axes_y].grid(visible=True, which="both", color="grey", linewidth=0.75, linestyle="dotted")
-        axes[axes_x, axes_y].set_xticks(range(min(x), max(x) + 1, 1), minor=True)
-        axes[axes_x, axes_y].axvline(x=best_epoch, color='green', linestyle='--', linewidth=1)
+    x = list(range(1, len(history['loss']) + 1))
+
+    for idx, characteristic in enumerate(METRICS):
+        axes_x = idx % GRAPHS_PER_ROW
+        axes_y = idx // GRAPHS_PER_ROW
+        ax = axes[axes_y, axes_x] if nrows > 1 else axes[axes_x]
+        y_train = history[characteristic]
+        y_test = history[f"val_{characteristic}"]
+        ax.plot(x, y_train, label="train")
+        ax.plot(x, y_test, label="test")
+        ax.set_title(characteristic)
+        ax.legend(loc="upper left")
+        ax.grid(visible=True, which="both", color="grey", linewidth=0.75, linestyle="dotted")
+        ax.set_xticks(range(min(x), max(x) + 1, 1), minor=True)
+        ax.axvline(x=best_epoch, color='green', linestyle='--', linewidth=1)
+
+    if nrows * ncols > len(METRICS):
+        for j in range(len(METRICS), nrows * ncols):
+            axes_x = j % GRAPHS_PER_ROW
+            axes_y = j // GRAPHS_PER_ROW
+            ax = axes[axes_y, axes_x] if nrows > 1 else axes[axes_x]
+            ax.axis('off')
 
     fig.text(0.001, 0.001, info, fontsize=10, color='green', backgroundcolor='white')
     plt.savefig(dir_path / f"{stamp}.png", dpi=96)
diff --git a/experiment/requirements.txt b/experiment/requirements.txt
index bdd2a5031..d63fc6123 100644
--- a/experiment/requirements.txt
+++ b/experiment/requirements.txt
@@ -2,20 +2,12 @@
 # pip 24.3.1
 
 # version sensetive
-h5py==3.12.1
-keras==2.15.0
-keras-tuner==1.4.7
-numpy==1.26.4
-onnx==1.17.0
-protobuf==3.20.3
-scikit-learn==1.6.1
-tensorflow==2.15.1
-tensorrt==10.8.0.43
-tf2onnx==1.16.1
-wrapt==1.14.1
+onnx==1.19.0
+scikit-learn==1.7.0
+torch==2.8.0
+optuna==4.2.1
 
 # version insensetive
-types-tensorflow
 matplotlib
 colorama
 psutil
diff --git a/experiment/src/ml_model.py b/experiment/src/ml_model.py
index fe224800a..aa397c990 100644
--- a/experiment/src/ml_model.py
+++ b/experiment/src/ml_model.py
@@ -1,84 +1,82 @@
 from typing import Any
 
-import keras_tuner as kt
-from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate, Dropout
-from tensorflow.keras.models import Model
-from tensorflow.keras.optimizers import Adam
-from tensorflow.python.keras.metrics import BinaryAccuracy, Precision, Recall
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 from credsweeper.common.constants import ML_HUNK
 from credsweeper.ml_model.ml_validator import MlValidator
 
 
-class MlModel(kt.HyperModel):
-    d_type = "float32"
+class MlModel(nn.Module):
 
-    def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, **kwargs):
-        self.line_shape = line_shape
-        self.variable_shape = variable_shape
-        self.value_shape = value_shape
-        self.feature_shape = feature_shape
-        self.__kwargs = kwargs
-
-    def __get_hyperparam(self, param_name: str, hp=None) -> Any:
-        if param := self.__kwargs.get(param_name):
-            if isinstance(param, float):
-                print(f"'{param_name}' constant = {param}")
-                return param
-            elif hp and isinstance(param, tuple) and 3 == len(param):
-                print(f"'{param_name}' tuning = {param}")
-                return hp.Float(param_name, min_value=param[0], max_value=param[1], step=param[2])
-            else:
-                raise ValueError(f"'{param_name}' was not inited well {param} tuner is {bool(hp)}")
-        else:
-            raise ValueError(f"'{param_name}' was not defined during init and tuner is used")
-
-    def build(self, hp=None) -> Model:
-        """Get keras model with string and feature input and single binary out"""
+    def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, hp=None):
+        super(MlModel, self).__init__()
+        if hp is None:
+            hp = {}
         value_lstm_dropout_rate = self.__get_hyperparam("value_lstm_dropout_rate", hp)
         line_lstm_dropout_rate = self.__get_hyperparam("line_lstm_dropout_rate", hp)
         variable_lstm_dropout_rate = self.__get_hyperparam("variable_lstm_dropout_rate", hp)
         dense_a_dropout_rate = self.__get_hyperparam("dense_a_lstm_dropout_rate", hp)
         dense_b_dropout_rate = self.__get_hyperparam("dense_b_lstm_dropout_rate", hp)
 
-        line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
-        line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
-        line_bidirectional = Bidirectional(layer=line_lstm, name="line_bidirectional")
-        line_lstm_branch = Dropout(line_lstm_dropout_rate, name="line_dropout")(line_bidirectional(line_input))
-
-        variable_input = Input(shape=(None, self.variable_shape[2]), name="variable_input", dtype=self.d_type)
-        variable_lstm = LSTM(units=self.variable_shape[1], dtype=self.d_type)
-        variable_bidirectional = Bidirectional(layer=variable_lstm, name="variable_bidirectional")
-        variable_lstm_branch = Dropout(variable_lstm_dropout_rate,
-                                       name="variable_dropout")(variable_bidirectional(variable_input))
-
-        value_input = Input(shape=(None, self.value_shape[2]), name="value_input", dtype=self.d_type)
-        value_lstm = LSTM(units=self.value_shape[1], dtype=self.d_type)
-        value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
-        value_lstm_branch = Dropout(value_lstm_dropout_rate, name="value_dropout")(value_bidirectional(value_input))
-
-        feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)
-
-        joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, feature_input])
+        self.line_lstm = nn.LSTM(input_size=line_shape[2],
+                                 hidden_size=line_shape[1],
+                                 batch_first=True,
+                                 bidirectional=True)
+        self.variable_lstm = nn.LSTM(input_size=variable_shape[2],
+                                     hidden_size=variable_shape[1],
+                                     batch_first=True,
+                                     bidirectional=True)
+        self.value_lstm = nn.LSTM(input_size=value_shape[2],
+                                  hidden_size=value_shape[1],
+                                  batch_first=True,
+                                  bidirectional=True)
+
+        self.line_dropout = nn.Dropout(line_lstm_dropout_rate)
+        self.variable_dropout = nn.Dropout(variable_lstm_dropout_rate)
+        self.value_dropout = nn.Dropout(value_lstm_dropout_rate)
+
+        dense_units = 2 * MlValidator.MAX_LEN + 2 * 2 * ML_HUNK + feature_shape[1]
+
+        self.dense_a = nn.Linear(dense_units, dense_units)
+        self.dense_b = nn.Linear(dense_units, dense_units)
+        self.dense_final = nn.Linear(dense_units, 1)
+
+        self.a_dropout = nn.Dropout(dense_a_dropout_rate)
+        self.b_dropout = nn.Dropout(dense_b_dropout_rate)
+
+    @staticmethod
+    def __get_hyperparam(param_name: str, hyperparameters=None) -> Any:
+        if hyperparameters is None:
+            hyperparameters = {}
+        if param_name in hyperparameters:
+            param = hyperparameters[param_name]
+            if isinstance(param, float):
+                return param
+            else:
+                raise ValueError(f"Unexpected '{param_name}': {param}")
+        else:
+            raise ValueError(f"'{param_name}' was not defined during initialization of the model.")
 
-        # 3 bidirectional + features
-        dense_units = 2 * MlValidator.MAX_LEN + 2 * 2 * ML_HUNK + self.feature_shape[1]
-        # check after model compilation. Should be matched the combined size.
+    def forward(self, line_input: torch.Tensor, variable_input: torch.Tensor, value_input: torch.Tensor,
+                feature_input: torch.Tensor):
+        line_out, _ = self.line_lstm(line_input)
+        line_out = self.line_dropout(line_out[:, -1, :])
 
-        # first hidden layer
-        dense_a = Dense(units=dense_units, activation='relu', name="a_dense", dtype=self.d_type)(joined_features)
-        dropout_dense_a = Dropout(dense_a_dropout_rate, name="a_dropout")(dense_a)
+        variable_out, _ = self.variable_lstm(variable_input)
+        variable_out = self.variable_dropout(variable_out[:, -1, :])
 
-        # second hidden layer
-        dense_b = Dense(units=dense_units, activation='relu', name="b_dense", dtype=self.d_type)(dropout_dense_a)
-        dropout_dense_b = Dropout(dense_b_dropout_rate, name="b_dropout")(dense_b)
+        value_out, _ = self.value_lstm(value_input)
+        value_out = self.value_dropout(value_out[:, -1, :])
 
-        dense_final = Dense(units=1, activation='sigmoid', name="prediction", dtype=self.d_type)(dropout_dense_b)
+        joined_features = torch.cat((line_out, variable_out, value_out, feature_input), dim=1)
 
-        metrics = [BinaryAccuracy(name="binary_accuracy"), Precision(name="precision"), Recall(name="recall")]
+        dense_a = F.relu(self.dense_a(joined_features))
+        dense_a = self.a_dropout(dense_a)
 
-        model: Model = Model(inputs=[line_input, variable_input, value_input, feature_input], outputs=dense_final)
-        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=metrics)
-        model.summary(line_length=120, expand_nested=True, show_trainable=True)
+        dense_b = F.relu(self.dense_b(dense_a))
+        dense_b = self.b_dropout(dense_b)
 
-        return model
+        output = torch.sigmoid(self.dense_final(dense_b))
+        return output
diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py
index c51c50e3b..5f28c2111 100644
--- a/experiment/src/prepare_data.py
+++ b/experiment/src/prepare_data.py
@@ -20,7 +20,7 @@ def execute_scanner(dataset_location: str, result_location_str: str, jobs: int,
         sys.exit(error_code)
 
 
-def dir_checksum(dir_path: Path) -> str:
+def data_checksum(dir_path: Path) -> str:
     checksum = hashlib.md5(b'').digest()
     for root, dirs, files in os.walk(dir_path):
         for file in files:
@@ -39,18 +39,17 @@ def prepare_train_data(cred_data_location: str, jobs: int, doc_target: bool):
         new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
         Util.yaml_dump(new_rules, "results/train_config.yaml")
 
-    meta_checksum = dir_checksum(Path(cred_data_location) / "meta")
+    meta_checksum = data_checksum(Path(cred_data_location) / "meta")
     print(f"meta checksum {meta_checksum}")
 
-    data_checksum = dir_checksum(Path(cred_data_location) / "data")
-    print(f"data checksum {data_checksum}")
-    detected_data_filename = f"results/detected_data.{data_checksum}.json"
+    data_dir_checksum = data_checksum(Path(cred_data_location) / "data")
+    print(f"data checksum {data_dir_checksum}")
+    detected_data_filename = f"results/detected_data.{data_dir_checksum}.json"
 
     if not os.path.exists(detected_data_filename):
         print(f"Get CredSweeper results from {cred_data_location}. May take some time")
         execute_scanner(cred_data_location, detected_data_filename, jobs, doc_target)
     else:
-        print(f"Get cached result {data_checksum}")
+        print(f"Get cached result {data_dir_checksum}")
 
     print("Train data prepared!")
-    return meta_checksum, data_checksum
diff --git a/experiment/train.py b/experiment/train.py
new file mode 100644
index 000000000..e6aa3764b
--- /dev/null
+++ b/experiment/train.py
@@ -0,0 +1,429 @@
+import hashlib
+import os
+import pathlib
+import pickle
+import random
+from datetime import datetime
+from typing import List, Dict
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.utils import compute_class_weight
+import optuna
+from optuna.samplers import GridSampler
+from optuna.pruners import NopPruner
+
+from experiment.plot import save_plot
+from experiment.src.ml_model import MlModel
+from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
+from experiment.src.features import prepare_data
+from experiment.src.model_config_preprocess import model_config_preprocess
+from experiment.src.prepare_data import prepare_train_data, data_checksum
+
+GPU_SAMPLE_LIMIT = 1024
+DEFAULT_LEARNING_RATE = 0.0005
+
+
+def objective(trial, train_loader: DataLoader, test_loader: DataLoader, model_inputs_size: List[tuple],
+              hp: Dict[str, tuple]):
+    best_val_loss = trial.study.user_attrs["best_val_loss"]
+    epochs = trial.study.user_attrs["epochs"]
+    device = trial.study.user_attrs["device"]
+    best_model_path = trial.study.user_attrs["best_model_path"]
+    params = {}
+    for param_name, ((low, high, step), _default) in hp.items():
+        params[param_name] = trial.suggest_float(param_name, low, high, step=step)
+
+    model = MlModel(*model_inputs_size, params).to(device)
+    optimizer = optim.Adam(model.parameters(), lr=DEFAULT_LEARNING_RATE)
+    criterion = nn.BCELoss()
+
+    best_loss = float('inf')
+    patience_counter = 0
+
+    if device.type == "cuda" and GPU_SAMPLE_LIMIT < train_loader.batch_size:
+        accumulation_steps = (train_loader.batch_size + GPU_SAMPLE_LIMIT - 1) // GPU_SAMPLE_LIMIT
+    else:
+        accumulation_steps = 1
+
+    for epoch in range(epochs):
+        model.train()
+        for batch in train_loader:
+            x_tensors = [x.to(device) for x in batch[:-1]]
+            y_batch = batch[-1].to(device)
+            batch_size = y_batch.shape[0]
+            sub_batch_size = max(1, batch_size // accumulation_steps)
+            optimizer.zero_grad()
+            for i in range(accumulation_steps):
+                start = i * sub_batch_size
+                end = (i + 1) * sub_batch_size if i < accumulation_steps - 1 else batch_size
+                inputs_sub = [tens[start:end] for tens in x_tensors]
+                labels_sub = y_batch[start:end]
+                outputs = model(*inputs_sub).squeeze(-1)
+                loss = criterion(outputs, labels_sub) / accumulation_steps
+                loss.backward()
+            optimizer.step()
+
+        model.eval()
+        val_loss = 0.0
+        with torch.no_grad():
+            for batch in test_loader:
+                x_tensors = [x.to(device) for x in batch[:-1]]
+                y_batch = batch[-1].to(device)
+                batch_size = y_batch.shape[0]
+                sub_batch_size = max(1, batch_size // accumulation_steps)
+                for i in range(accumulation_steps):
+                    start = i * sub_batch_size
+                    end = (i + 1) * sub_batch_size if i < accumulation_steps - 1 else batch_size
+                    inputs_sub = [tens[start:end] for tens in x_tensors]
+                    labels_sub = y_batch[start:end]
+                    outputs = model(*inputs_sub).squeeze(-1)
+                    loss = criterion(outputs, labels_sub) / accumulation_steps
+                    val_loss += loss.item()
+        val_loss /= len(test_loader)
+        trial.report(val_loss, epoch)
+        if val_loss < best_loss:
+            best_loss = val_loss
+            patience_counter = 0
+            if val_loss < best_val_loss:
+                best_val_loss = val_loss
+                trial.study.set_user_attr("best_val_loss", best_val_loss)
+                torch.save(model.state_dict(), best_model_path)
+        else:
+            patience_counter += 1
+            if patience_counter >= 5:
+                break
+        if trial.should_prune():
+            raise optuna.TrialPruned()
+
+    return best_loss
+
+
+def evaluate_model(thresholds: dict,
+                   model: torch.nn.Module,
+                   x_data: List[np.ndarray],
+                   y_label: np.ndarray,
+                   device,
+                   batch_size=256):
+    model.eval()
+    predictions_proba = []
+    dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_data])
+    data_loader = DataLoader(dataset, batch_size=batch_size)
+    with torch.no_grad():
+        for batch in data_loader:
+            x_tensors = [x.to(device) for x in batch]
+            batch_preds = model(*x_tensors).cpu().numpy().ravel()
+            predictions_proba.extend(batch_preds)
+    predictions_proba = np.array(predictions_proba)
+    for name, threshold in thresholds.items():
+        predictions = (predictions_proba > threshold)
+        accuracy = accuracy_score(y_label, predictions)
+        precision = precision_score(y_label, predictions)
+        recall = recall_score(y_label, predictions)
+        loss = log_loss(y_label, predictions)
+        f1 = f1_score(y_label, predictions)
+        print(f"{name}: {threshold:0.6f}, "
+              f"accuracy: {accuracy:0.6f}, "
+              f"precision:{precision:0.6f}, "
+              f"recall: {recall:0.6f}, "
+              f"loss: {loss:0.6f}, "
+              f"F1:{f1:0.6f}")
+
+
+def train(cred_data_location: str,
+          jobs: int,
+          epochs: int,
+          batch_size: int,
+          device: str,
+          patience: int,
+          doc_target: bool,
+          use_tuner: bool = False,
+          eval_full: bool = False) -> str:
+    if device == "cpu":
+        device = torch.device("cpu")
+    elif device == "cuda" and torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        raise ValueError(f"Device {device} not supported or not available")
+
+    print(f"Use device: {device}")
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    dir_path = pathlib.Path("results")
+    os.makedirs(dir_path, exist_ok=True)
+
+    print(f"Train model on data from {cred_data_location}")
+    prepare_train_data(cred_data_location, jobs, doc_target)
+
+    cred_data_location_path = pathlib.Path(cred_data_location) / "data"
+    detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location_path)}.json")
+    print(f"CredSweeper detected {len(detected_data)} credentials without ML")
+    meta_data = read_metadata(f"{cred_data_location}/meta")
+    print(f"Metadata markup: {len(meta_data)} items")
+
+    df_all = join_label(detected_data, meta_data, cred_data_location)
+    del detected_data
+    del meta_data
+
+    for i in range(3):
+        try:
+            thresholds = model_config_preprocess(df_all, doc_target)
+            break
+        except RuntimeError as exc:
+            if "RESTART:" in str(exc):
+                continue
+            else:
+                raise
+    else:
+        raise RuntimeError("Something went wrong")
+
+    print(f"Common dataset: {len(df_all)} items")
+    df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
+    print(f"Common dataset: {len(df_all)} items after drop duplicates")
+
+    lucky_number = random.randint(1, 1 << 32)
+    print(f"Lucky number: {lucky_number}")
+    df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=lucky_number)
+    len_df_train = len(df_train)
+    print(f"Train size: {len_df_train}")
+    len_df_test = len(df_test)
+    print(f"Test size: {len_df_test}")
+
+    if eval_full:
+        print("Prepare full data")
+        x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
+        y_full = get_y_labels(df_all)
+    del df_all
+
+    print("Prepare train data")
+    x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
+    y_train = get_y_labels(df_train)
+    del df_train
+
+    print(f"Class-1 prop on train: {np.mean(y_train):.4f}")
+    classes = np.unique(y_train)
+    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
+    max_weight = max(class_weights)
+    class_weights = [weight / max_weight for weight in class_weights]
+    print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}")
+    class_weight = dict(zip(classes, class_weights))
+    print(f"class_weight: {class_weight}")
+
+    print("Prepare test data")
+    x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
+    y_test = get_y_labels(df_test)
+    print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
+    del df_test
+
+    hp_dict = {
+        "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
+        "line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.3),
+        "variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.31),
+        "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.45),
+        "dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.3),
+    }
+    history = {
+        "loss": [],
+        "val_loss": [],
+        "accuracy": [],
+        "val_accuracy": [],
+        "precision": [],
+        "val_precision": [],
+        "recall": [],
+        "val_recall": []
+    }
+
+    x_train = [x_train_line, x_train_variable, x_train_value, x_train_features]
+    x_test = [x_test_line, x_test_variable, x_test_value, x_test_features]
+    x_full = [x_full_line, x_full_variable, x_full_value, x_full_features] if eval_full else None
+
+    print("Create pytorch train and test datasets...")
+    train_dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_train],
+                                  torch.tensor(y_train, dtype=torch.float32))
+    test_dataset = TensorDataset(*[torch.tensor(x, dtype=torch.float32) for x in x_test],
+                                 torch.tensor(y_test, dtype=torch.float32))
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)
+
+    inputs_size = [x_train_line.shape, x_train_variable.shape, x_train_value.shape, x_train_features.shape]
+
+    if use_tuner:
+        print("Start model train with optimization")
+        search_space = {k: list(np.arange(lo, hi + st, st)) for k, ((lo, hi, st), _d) in hp_dict.items()}
+        study = optuna.create_study(sampler=GridSampler(search_space), pruner=NopPruner(), direction="minimize")
+        study.set_user_attr("best_val_loss", float("inf"))
+        study.set_user_attr("epochs", epochs)
+        study.set_user_attr("device", device)
+        study.set_user_attr("best_model_path", str(dir_path / f"{current_time}.trials.best_model.pth"))
+        study.optimize(lambda trial: objective(trial, train_loader, test_loader, inputs_size, hp_dict), n_trials=10)
+        param_kwargs = study.best_params
+        df_trials = study.trials_dataframe()
+        df_trials.to_csv(dir_path / f"{current_time}_trials_df.csv", sep=';')
+    else:
+        param_kwargs = {k: d for k, (_r, d) in hp_dict.items()}
+
+    print(f"Model will be trained using the following params:{param_kwargs}")
+    ml_model = MlModel(*inputs_size, param_kwargs).to(device)
+    optimizer = optim.Adam(ml_model.parameters(), lr=DEFAULT_LEARNING_RATE)
+    criterion = nn.BCELoss()
+
+    best_loss = float('inf')
+    best_epoch = 1
+    patience_counter = 0
+
+    if device.type == "cuda" and GPU_SAMPLE_LIMIT < batch_size:
+        accumulation_steps = (batch_size + GPU_SAMPLE_LIMIT - 1) // GPU_SAMPLE_LIMIT
+    else:
+        accumulation_steps = 1
+
+    for epoch in range(epochs):
+        ml_model.train()
+        running_loss, correct, total = 0.0, 0, 0
+        all_preds, all_labels = [], []
+        for b_idx, batch in enumerate(train_loader, start=1):
+            x_tensors = [x.to(device) for x in batch[:-1]]
+            y_batch = batch[-1].to(device)
+            optimizer.zero_grad()
+            bs = y_batch.shape[0]
+            sub_bs = max(1, bs // accumulation_steps)
+            preds_collect = []
+            for i in range(accumulation_steps):
+                s = i * sub_bs
+                e = (i + 1) * sub_bs if i < accumulation_steps - 1 else bs
+                inputs_sub = [t[s:e] for t in x_tensors]
+                labels_sub = y_batch[s:e]
+                outputs = ml_model(*inputs_sub).squeeze(-1)
+                preds_collect.append(outputs.detach())
+                loss = criterion(outputs, labels_sub) / accumulation_steps
+                running_loss += loss.item()
+                loss.backward()
+            optimizer.step()
+            batch_outputs = torch.cat(preds_collect, dim=0)
+            correct += (batch_outputs.round() == y_batch).sum().item()
+            total += y_batch.numel()
+            all_preds.extend(batch_outputs.cpu().numpy())
+            all_labels.extend(y_batch.cpu().numpy())
+            iter_acc = (batch_outputs.round() == y_batch).float().mean().item()
+            iter_prec = precision_score(y_batch.cpu().numpy(), (batch_outputs.cpu().numpy() > 0.5), zero_division=0)
+            iter_rec = recall_score(y_batch.cpu().numpy(), (batch_outputs.cpu().numpy() > 0.5), zero_division=0)
+            print(
+                f"iter {epoch+1}.{b_idx} loss:{running_loss/b_idx:.4f} acc:{iter_acc:.4f} prec:{iter_prec:.4f} rec:{iter_rec:.4f}"
+            )
+
+        train_loss = running_loss / len(train_loader)
+        train_acc = correct / total
+        train_prec = precision_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        train_rec = recall_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        history["loss"].append(train_loss)
+        history["accuracy"].append(train_acc)
+        history["precision"].append(train_prec)
+        history["recall"].append(train_rec)
+
+        ml_model.eval()
+        val_loss, correct, total = 0.0, 0, 0
+        all_preds, all_labels = [], []
+        with torch.no_grad():
+            for batch in test_loader:
+                x_tensors = [x.to(device) for x in batch[:-1]]
+                y_batch = batch[-1].to(device)
+                bs = y_batch.shape[0]
+                sub_bs = max(1, bs // accumulation_steps)
+                preds_collect = []
+                for i in range(accumulation_steps):
+                    s = i * sub_bs
+                    e = (i + 1) * sub_bs if i < accumulation_steps - 1 else bs
+                    inputs_sub = [t[s:e] for t in x_tensors]
+                    labels_sub = y_batch[s:e]
+                    outputs = ml_model(*inputs_sub).squeeze(-1)
+                    preds_collect.append(outputs.detach())
+                    loss = criterion(outputs, labels_sub) / accumulation_steps
+                    val_loss += loss.item()
+                val_outputs = torch.cat(preds_collect, dim=0)
+                correct += (val_outputs.round() == y_batch).sum().item()
+                total += y_batch.numel()
+                all_preds.extend(val_outputs.cpu().numpy())
+                all_labels.extend(y_batch.cpu().numpy())
+
+        val_loss /= len(test_loader)
+        val_acc = correct / total
+        val_prec = precision_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        val_rec = recall_score(all_labels, np.array(all_preds) > 0.5, zero_division=0)
+        history["val_loss"].append(val_loss)
+        history["val_accuracy"].append(val_acc)
+        history["val_precision"].append(val_prec)
+        history["val_recall"].append(val_rec)
+
+        print(f"Epoch [{epoch+1}/{epochs}]:")
+        print(f"\tTrain - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Prec: {train_prec:.4f}, Rec: {train_rec:.4f}")
+        print(f"\tValidation - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Prec: {val_prec:.4f}, Rec: {val_rec:.4f}")
+
+        if val_loss < best_loss:
+            best_loss = val_loss
+            best_epoch = epoch + 1
+            torch.save(ml_model.state_dict(), dir_path / f"{current_time}.best_model.pth")
+            patience_counter = 0
+        else:
+            patience_counter += 1
+            if patience_counter >= patience:
+                print("Early stopping triggered")
+                break
+
+    ml_model.load_state_dict(torch.load(dir_path / f"{current_time}.best_model.pth", map_location=device))
+
+    print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}")
+    evaluate_model(thresholds, ml_model, x_train, y_train, device, batch_size)
+
+    print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}")
+    evaluate_model(thresholds, ml_model, x_test, y_test, device, batch_size)
+
+    if eval_full:
+        print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}")
+        evaluate_model(thresholds, ml_model, x_full, y_full, device, batch_size)
+        del x_full, x_full_line, x_full_variable, x_full_value, x_full_features, y_full
+
+    onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
+    batch_idx = {0: "batch_size"}
+    dynamic_axes = {
+        "line_input": batch_idx,
+        "variable_input": batch_idx,
+        "value_input": batch_idx,
+        "feature_input": batch_idx,
+        "output": batch_idx,
+    }
+    x_tensors = tuple(torch.tensor([x[0]], dtype=torch.float32).to(device) for x in x_test)
+    with torch.no_grad():
+        torch.onnx.export(ml_model,
+                          x_tensors,
+                          onnx_model_file,
+                          input_names=list(dynamic_axes.keys())[:4],
+                          output_names=list(dynamic_axes.keys())[4:],
+                          dynamic_axes=dynamic_axes,
+                          opset_version=13)
+    print(f"ONNX model exported to {onnx_model_file}")
+
+    with open(onnx_model_file, "rb") as f:
+        onnx_md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"ml_model.onnx:{onnx_md5}")
+
+    with open(pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_config.json", "rb") as f:
+        config_md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"ml_config.json:{config_md5}")
+
+    with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
+        pickle.dump(history, f)
+
+    save_plot(
+        stamp=current_time,
+        title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
+        history=history,
+        dir_path=dir_path,
+        best_epoch=int(best_epoch),
+        info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
+    )
+
+    return str(onnx_model_file)
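
The training loops in train.py split any batch larger than GPU_SAMPLE_LIMIT into sub-batches, scale each sub-batch loss by the number of accumulation steps, and step the optimizer once per original batch. A minimal sketch of that gradient-accumulation pattern in isolation; train_batch_accumulated is an illustrative helper that is not part of the patch, and model, criterion, and optimizer are assumed to be set up as in train.py:

import torch


def train_batch_accumulated(model, criterion, optimizer, x_tensors, y_batch, accumulation_steps):
    """One optimizer step over a batch processed in accumulation_steps slices."""
    batch_size = y_batch.shape[0]
    sub = max(1, batch_size // accumulation_steps)
    optimizer.zero_grad()
    for i in range(accumulation_steps):
        start = i * sub
        end = (i + 1) * sub if i < accumulation_steps - 1 else batch_size
        # squeeze(-1) keeps the batch dimension even for a single-sample slice
        outputs = model(*[t[start:end] for t in x_tensors]).squeeze(-1)
        # scaling keeps the accumulated gradient close to the full-batch mean loss
        loss = criterion(outputs, y_batch[start:end]) / accumulation_steps
        loss.backward()  # gradients add up across slices
    optimizer.step()  # single parameter update per original batch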
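
Because the export path moves from tf2onnx over a saved Keras model to torch.onnx.export, a parity check between the PyTorch module and the exported graph can catch conversion issues early. A sketch, assuming onnxruntime is available (it is not pinned in experiment/requirements.txt); check_onnx_export is a hypothetical helper, and only the input and output names are taken from the dynamic_axes mapping in train.py:

import numpy as np
import onnxruntime as ort
import torch

INPUT_NAMES = ["line_input", "variable_input", "value_input", "feature_input"]


def check_onnx_export(onnx_path: str, ml_model: torch.nn.Module, x_numpy, atol: float = 1e-5) -> None:
    """Compare PyTorch and onnxruntime outputs on one small batch of numpy inputs."""
    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    feed = {name: np.asarray(arr, dtype=np.float32) for name, arr in zip(INPUT_NAMES, x_numpy)}
    (onnx_out,) = session.run(["output"], feed)
    ml_model = ml_model.cpu().eval()  # run the reference forward pass on CPU
    with torch.no_grad():
        torch_out = ml_model(*(torch.from_numpy(feed[name]) for name in INPUT_NAMES)).numpy()
    if not np.allclose(onnx_out, torch_out, atol=atol):
        raise AssertionError("ONNX and PyTorch outputs diverge")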