Upload 40 files
- datasets/adult.arff +0 -0
- datasets/bank-full.arff +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.001.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.002.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.003.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.004.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.005.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.006.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.007.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.008.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.009.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.010.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.011.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.012.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.013.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.014.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.015.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.016.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.017.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.018.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.019.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.020.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.021.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.022.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.023.png +0 -0
- figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.024.png +0 -0
- figures/adults_fooled.png +0 -0
- figures/adults_not_fooled.png +0 -0
- figures/bank_full_fooled.png +0 -0
- figures/bank_full_not_fooled.png +0 -0
- figures/data_adult_ep_10_bs_128_lr_0.001_al_0.5_dr_0.5_losses.png +0 -0
- figures/data_adult_ep_10_bs_128_lr_0.001_al_0.5_dr_0.5_pca.png +0 -0
- figures/data_bank-full_ep_10_bs_128_lr_0.001_al_0.2_dr_0.3_losses.png +0 -0
- figures/data_bank-full_ep_10_bs_128_lr_0.001_al_0.2_dr_0.3_pca.png +0 -0
- input_data/adult.arff +0 -0
- input_data/bank-full.arff +0 -0
- nt_exp.py +203 -0
- nt_gan.py +333 -0
- nt_gg.py +282 -0
- outputs/empty +1 -0
datasets/adult.arff
ADDED
The diff for this file is too large to render. See raw diff.

datasets/bank-full.arff
ADDED
The diff for this file is too large to render. See raw diff.
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.001.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.002.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.003.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.004.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.005.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.006.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.007.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.008.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.009.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.010.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.011.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.012.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.013.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.014.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.015.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.016.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.017.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.018.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.019.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.020.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.021.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.022.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.023.png
ADDED
figures/Aspose.Words.36be2542-1776-4b1c-8010-360ae82480ae.024.png
ADDED
figures/adults_fooled.png
ADDED
figures/adults_not_fooled.png
ADDED
figures/bank_full_fooled.png
ADDED
figures/bank_full_not_fooled.png
ADDED
figures/data_adult_ep_10_bs_128_lr_0.001_al_0.5_dr_0.5_losses.png
ADDED
figures/data_adult_ep_10_bs_128_lr_0.001_al_0.5_dr_0.5_pca.png
ADDED
figures/data_bank-full_ep_10_bs_128_lr_0.001_al_0.2_dr_0.3_losses.png
ADDED
figures/data_bank-full_ep_10_bs_128_lr_0.001_al_0.2_dr_0.3_pca.png
ADDED
input_data/adult.arff
ADDED
The diff for this file is too large to render. See raw diff.

input_data/bank-full.arff
ADDED
The diff for this file is too large to render. See raw diff.
nt_exp.py
ADDED
@@ -0,0 +1,203 @@
import os

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from nt_gan import GAN
from nt_gg import GG

dataset_directory = 'datasets'
saved_models_path = 'outputs'


def prepare_architecture(arff_data_path):
    """
    This function prepares the inputs for the GAN architecture:
    the arff file is loaded, its nominal attributes are encoded and the result is min-max scaled.
    :param arff_data_path: path of the arff data file
    :return: the scaled data, the reverse metadata mapping, the column names,
             the fitted scaler and the number of features
    """
    data, meta_data = arff.loadarff(arff_data_path)  # Reads an arff file into a tuple of data and its metadata
    df = pd.DataFrame(data)
    columns = df.columns
    transformed_data, x, x_scaled, meta_data_rev, min_max_scaler = create_scaled_data(df, meta_data)

    number_of_features = len(transformed_data.columns)  # Number of input features for the GAN

    return x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features


def create_scaled_data(df, meta_data):
    """
    Encode the nominal attributes as integers using the arff metadata, then min-max scale everything.
    :param df: the raw dataframe
    :param meta_data: the arff metadata object
    :return: the encoded data, the raw numpy array, the scaled array,
             the reverse metadata mapping and the fitted scaler
    """
    meta_data_dict = {k: {a.replace(' ', ''): b + 1 for b, a in enumerate(v.values)} for k, v in
                      meta_data._attributes.items() if
                      v.type_name != 'numeric'}  # Starts from one, not zero, because zero is reserved for NaN values
    meta_data_rev = {k: {b + 1: a.replace(' ', '') for b, a in enumerate(v.values)} for k, v in
                     meta_data._attributes.items() if
                     v.type_name != 'numeric'}  # Same convention as above
    transformed_data = df.copy()
    for col in df.columns:
        if col in meta_data_dict:
            # arff nominal values arrive as byte strings like b'Private'; take the text between the quotes.
            # Sometimes a value cannot be found in the metadata, so we treat it as NaN (code 0).
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_dict[col][str(x).split('\'')[1]] if str(x).split('\'')[1] in meta_data_dict[
                    col] else 0)
    x = transformed_data.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return transformed_data, x, x_scaled, meta_data_rev, min_max_scaler


def re_scaled_data(data, columns, meta_data_rev, min_max_scaler):
    """
    This function re-scales generated data back to the original format.
    :param data: the scaled data we want to invert
    :param columns: the original column names
    :param meta_data_rev: mapping from integer codes back to nominal values
    :param min_max_scaler: the scaler fitted on the original data
    :return: a dataframe in the original feature space
    """
    data_inv = min_max_scaler.inverse_transform(data)
    df = pd.DataFrame(data_inv, columns=columns)
    transformed_data = df.copy()
    for col in transformed_data.columns:
        if col in meta_data_rev:
            # Codes that do not round to a known nominal value are treated as NaN
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_rev[col][int(round(x))] if int(round(x)) in meta_data_rev[
                    col] else np.nan)
    return transformed_data


def first_question():
    """
    This function answers the first question: a grid search over the GAN hyper-parameters,
    collecting losses and distance-based quality measures per combination.
    :return:
    """
    to_plot_losses = True
    results_output = os.path.join(saved_models_path, 'question_one_results.csv')
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'dis_loss': [],
               'activation': [], 'fooled_len': [], 'not_fooled_len': [], 'mean_min_distance_fooled': [],
               'mean_min_distance_not_fooled': [], 'mean_min_distance_gap': []}
    # Score used to rank combinations: w1 * (MMDF + MMDNF) - w3 * MMDG + w2 * (NFL / 100)
    # where MMDG = MMDNF - MMDF
    # Full grid (commented out; a single combination is left in for a quick run):
    # data_name = ["adult", "bank-full"]
    # learning_rate = [0.01, 0.001, 0.0001]
    # epochs = [5, 10, 15]
    # batch_size = [64, 128, 1024]
    # alpha_relu = [0.2, 0.5]
    # dropout = [0.3, 0.5]
    data_name = ["adult"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.5]
    dropout = [0.5]
    loss = 'binary_crossentropy'
    activation = 'sigmoid'

    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            arff_data_path = f'./datasets/{data}.arff'
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_al_{al}_dr_{dr}'
                            pca_output = os.path.join(saved_models_path, f'{model_name}_pca.png')
                            fooled_output = os.path.join(saved_models_path, f'{model_name}_fooled.csv')
                            not_fooled_output = os.path.join(saved_models_path, f'{model_name}_not_fooled.csv')

                            x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features = prepare_architecture(
                                arff_data_path)
                            gan_obj = GAN(number_of_features=number_of_features, saved_models_path=saved_models_path,
                                          learning_rate=lr, alpha_relu=al, dropout=dr,
                                          loss=loss, activation=activation)
                            gen_loss, dis_loss = gan_obj.train(scaled_data=x_scaled, epochs=ep, batch_size=bs,
                                                               to_plot_losses=to_plot_losses, model_name=model_name)
                            dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled = gan_obj.test(
                                scaled_data=x_scaled, sample_num=100, pca_output=pca_output)
                            dis_fooled = re_scaled_data(data=dis_fooled_scaled, columns=columns,
                                                        meta_data_rev=meta_data_rev,
                                                        min_max_scaler=min_max_scaler)
                            dis_fooled.to_csv(fooled_output)
                            dis_not_fooled = re_scaled_data(data=dis_not_fooled_scaled, columns=columns,
                                                            meta_data_rev=meta_data_rev,
                                                            min_max_scaler=min_max_scaler)
                            dis_not_fooled.to_csv(not_fooled_output)
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(gen_loss)
                            results['dis_loss'].append(dis_loss)
                            results['activation'].append(activation)
                            results['fooled_len'].append(len(dis_fooled_scaled))
                            results['not_fooled_len'].append(len(dis_not_fooled_scaled))
                            results['mean_min_distance_fooled'].append(mean_min_distance_fooled)
                            results['mean_min_distance_not_fooled'].append(mean_min_distance_not_fooled)
                            results['mean_min_distance_gap'].append(mean_min_distance_not_fooled - mean_min_distance_fooled)
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)


def second_question():
    """
    This function answers the second question: trains the general generator (GG)
    against a black-box discriminator and records the probability errors.
    :return:
    """
    data_name = ["adult", "bank-full"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.2]
    dropout = [0.3]
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [],
               'proba_error': []}
    combs = len(data_name) * len(learning_rate) * len(epochs) * len(batch_size) * len(alpha_relu) * len(dropout)
    i = 1
    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            print(f'Running combination {i}/{combs}')
                            data_path = f'./datasets/{data}.arff'
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_part2'
                            x_scaled, meta_data_rev, cols, min_max_scaler, feature_num = prepare_architecture(data_path)
                            general_generator = GG(feature_num, saved_models_path, lr, dr, al)
                            x_train, x_test, y_train, y_test = train_test_split(x_scaled[:, :-1], x_scaled[:, -1],
                                                                                test_size=0.1)
                            general_generator.train_gg(x_train, y_train, ep, bs, model_name, data, saved_models_path,
                                                       True)
                            error = general_generator.get_error()
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(general_generator.losses['gen_loss'][-1])
                            results['proba_error'].append(error.mean())
                            i += 1
                            # Test set performance
                            general_generator.plot_discriminator_results(x_test, y_test, data, saved_models_path)
                            general_generator.plot_generator_results(data, saved_models_path)

    results_output = os.path.join(saved_models_path, 'question_two_results.csv')
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)


def main():
    # first_question()
    second_question()


if __name__ == '__main__':
    main()
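The two helpers above are the whole data pipeline: prepare_architecture encodes and scales an ARFF file, and re_scaled_data maps generated rows back to readable records. A minimal round-trip sketch, assuming the repository's datasets/adult.arff is present; the five-row slice is just for illustration:

from nt_exp import prepare_architecture, re_scaled_data

arff_path = './datasets/adult.arff'
x_scaled, meta_data_rev, columns, scaler, n_features = prepare_architecture(arff_path)

# Invert a few scaled rows: nominal codes become strings again, and codes
# with no matching nominal value become NaN, which is exactly what happens
# to rows sampled from the generator in first_question().
recovered = re_scaled_data(data=x_scaled[:5], columns=columns,
                           meta_data_rev=meta_data_rev, min_max_scaler=scaler)
print(recovered.head())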
nt_gan.py
ADDED
@@ -0,0 +1,333 @@
import os
from itertools import compress

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam
from numpy.random import randn
from sklearn.decomposition import PCA
from tqdm import tqdm


class GAN(object):
    def __init__(self, number_of_features, saved_models_path, learning_rate, alpha_relu, dropout, loss, activation):
        """
        A constructor for the GAN class
        :param number_of_features: number of features in the data
        :param saved_models_path: the output folder path
        :param learning_rate: learning rate for the Adam optimizers
        :param alpha_relu: slope of the LeakyReLU activations
        :param dropout: dropout rate for the hidden layers
        :param loss: name of the loss function
        :param activation: name of the output activation
        """
        self.saved_models_path = saved_models_path
        self.number_of_features = number_of_features

        self.generator_model = None
        self.noise_dim = None
        self.discriminator_model = None
        self.learning_rate = learning_rate
        self.gan_model = None
        self.activation = activation
        self.alpha_relu = alpha_relu
        self.loss = loss
        self.dropout = dropout

        self.build_generator()  # build the generator
        self.build_discriminator()  # build the discriminator
        self.build_gan()  # combine them into the GAN

    def build_generator(self):
        """
        This function creates the generator model.
        The generator is not compiled on its own; it is trained through the combined GAN model.
        :return:
        """
        noise_size = int(self.number_of_features / 2)
        self.noise_dim = (noise_size,)  # size of the noise space

        self.generator_model = Sequential()
        self.generator_model.add(Dense(int(self.number_of_features * 2), input_shape=self.noise_dim))
        self.generator_model.add(LeakyReLU(alpha=self.alpha_relu))

        self.generator_model.add(Dense(int(self.number_of_features * 4)))
        self.generator_model.add(LeakyReLU(alpha=self.alpha_relu))
        self.generator_model.add(Dropout(self.dropout))

        self.generator_model.add(Dense(int(self.number_of_features * 2)))
        self.generator_model.add(LeakyReLU(alpha=self.alpha_relu))
        self.generator_model.add(Dropout(self.dropout))

        self.generator_model.add(Dense(self.number_of_features, activation=self.activation))
        self.generator_model.summary()

    def build_discriminator(self):
        """
        Create and compile the discriminator model
        :return:
        """
        self.discriminator_model = Sequential()

        self.discriminator_model.add(Dense(self.number_of_features * 2, input_shape=(self.number_of_features,)))
        self.discriminator_model.add(LeakyReLU(alpha=self.alpha_relu))

        self.discriminator_model.add(Dense(self.number_of_features * 4))
        self.discriminator_model.add(LeakyReLU(alpha=self.alpha_relu))
        self.discriminator_model.add(Dropout(self.dropout))

        self.discriminator_model.add(Dense(self.number_of_features * 2))
        self.discriminator_model.add(LeakyReLU(alpha=self.alpha_relu))
        self.discriminator_model.add(Dropout(self.dropout))

        # Output layer, then compile it
        self.discriminator_model.add(Dense(1, activation=self.activation))
        optimizer = Adam(learning_rate=self.learning_rate)
        self.discriminator_model.compile(loss=self.loss, optimizer=optimizer)
        self.discriminator_model.summary()

    def build_gan(self):
        """
        Create the GAN network
        :return: the GAN model object
        """
        self.gan_model = Sequential()
        self.discriminator_model.trainable = False  # only the generator is updated through the combined model

        # The following lines connect the generator and discriminator models to the GAN.
        self.gan_model.add(self.generator_model)
        self.gan_model.add(self.discriminator_model)

        # Compile it
        optimizer = Adam(learning_rate=self.learning_rate)
        self.gan_model.compile(loss=self.loss, optimizer=optimizer)

        return self.gan_model

    def train(self, scaled_data, epochs, batch_size, to_plot_losses, model_name):
        """
        This function trains the generator and the discriminator.
        :param model_name: name used for the saved weights and figures
        :param to_plot_losses: whether or not to plot the loss history
        :param scaled_data: the data after min-max scaling
        :param epochs: number of epochs
        :param batch_size: the batch size
        :return: the last generator loss and the last discriminator loss
        """
        dis_output, gen_output, prev_output = self.check_for_existed_output(model_name)
        if prev_output:
            return -1, -1

        losses_output = os.path.join(self.saved_models_path, f'{model_name}_losses.png')
        discriminator_loss = []
        generator_loss = []

        # We need to use half of the batch size for the fake data and half for the real one
        half_batch_size = int(batch_size / 2)
        iterations = int(len(scaled_data) / half_batch_size)
        # One extra iteration covers the partial last batch (remainder w.r.t. the half batch size)
        iterations = iterations + 1 if len(scaled_data) % half_batch_size != 0 else iterations

        for epoch in range(1, epochs + 1):  # iterates over the epochs
            np.random.shuffle(scaled_data)
            p_bar = tqdm(range(iterations), ascii=True)
            for iteration in p_bar:
                dis_loss, gen_loss = self.train_models(batch_size=batch_size, half_batch_size=half_batch_size,
                                                       index=iteration, scaled_data=scaled_data)
                discriminator_loss.append(dis_loss)
                generator_loss.append(gen_loss)
                p_bar.set_description(
                    f"Epoch ({epoch}/{epochs}) | DISCRIMINATOR LOSS: {dis_loss:.2f} | GENERATOR LOSS: {gen_loss:.2f} |")

        # Save weights for future use
        self.discriminator_model.save_weights(dis_output)
        self.generator_model.save_weights(gen_output)

        # Plot losses
        if to_plot_losses:
            self.plot_losses(discriminator_loss=discriminator_loss, generator_loss=generator_loss,
                             losses_output=losses_output)

        return generator_loss[-1], discriminator_loss[-1]

    def check_for_existed_output(self, model_name) -> (str, str, bool):
        """
        This function checks whether this model was already trained; if so, its weights are loaded.
        :param model_name: model's name
        :return: the discriminator weights path, the generator weights path and whether they already existed
        """
        prev_output = False
        dis_output = os.path.join(self.saved_models_path, f'{model_name}_dis_weights.h5')
        gen_output = os.path.join(self.saved_models_path, f'{model_name}_gen_weights.h5')
        if os.path.exists(dis_output) and os.path.exists(gen_output):
            print("The model was trained in the past")
            self.discriminator_model.load_weights(dis_output)
            self.generator_model.load_weights(gen_output)
            prev_output = True
        return dis_output, gen_output, prev_output

    def train_models(self, batch_size, half_batch_size, index, scaled_data):
        """
        This function trains the discriminator and the generator for one iteration.
        :param batch_size: batch size
        :param half_batch_size: half of the batch size
        :param index: the index of the current batch
        :param scaled_data: the scaled training data
        :return: the average discriminator loss and the generator loss
        """
        self.discriminator_model.trainable = True

        # Create a batch of real data and train the model
        x_real, y_real = self.get_real_samples(data=scaled_data, batch_size=half_batch_size, index=index)
        d_real_loss = self.discriminator_model.train_on_batch(x_real, y_real)

        # Create a batch of fake data and train the model
        x_fake, y_fake = self.create_fake_samples(batch_size=half_batch_size)
        d_fake_loss = self.discriminator_model.train_on_batch(x_fake, y_fake)

        avg_dis_loss = 0.5 * (d_real_loss + d_fake_loss)

        # Create noise for the generator model
        noise = randn(self.noise_dim[0] * batch_size).reshape((batch_size, self.noise_dim[0]))

        # Train the generator through the combined model, with the discriminator frozen
        self.discriminator_model.trainable = False
        gen_loss = self.gan_model.train_on_batch(noise, np.ones((batch_size, 1)))

        return avg_dis_loss, gen_loss

    @staticmethod
    def get_real_samples(data, batch_size, index):
        """
        Generate batch_size real samples with class labels
        :param data: the original data
        :param batch_size: batch size
        :param index: the index of the batch
        :return: x: real samples, y: labels (all ones)
        """
        start_index = batch_size * index
        end_index = start_index + batch_size
        x = data[start_index: end_index]

        return x, np.ones((len(x), 1))

    def create_fake_samples(self, batch_size):
        """
        Use the generator to generate batch_size fake examples, with class labels (all zeros)
        :param batch_size: batch size
        :return:
        """
        noise = randn(self.noise_dim[0] * batch_size).reshape((batch_size, self.noise_dim[0]))
        x = self.generator_model.predict(noise)  # create fake samples using the generator

        return x, np.zeros((len(x), 1))

    @staticmethod
    def plot_losses(discriminator_loss, generator_loss, losses_output):
        """
        Plot training loss values
        :param generator_loss:
        :param discriminator_loss:
        :param losses_output:
        :return:
        """
        plt.clf()  # start a fresh figure so consecutive grid-search runs do not overlay
        plt.plot(discriminator_loss)
        plt.plot(generator_loss)
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.title('Discriminator and Generator Losses')
        plt.legend(['Discriminator Loss', 'Generator Loss'])
        plt.savefig(losses_output)

    @staticmethod
    def return_minimum_euclidean_distance(scaled_data, x):
        """
        This function returns the minimum Euclidean distance between x and the records of scaled_data.
        :param scaled_data: the original data
        :param x: a record we want to compare with
        :return: the minimum distance and the index of the closest record
        """
        s = np.power(np.power((scaled_data - np.array(x)), 2).sum(1), 0.5)
        return pd.Series([s[s.argmin()], s.argmin()])

    def test(self, scaled_data, sample_num, pca_output):
        """
        This function tests the model
        :param scaled_data: the original scaled data
        :param sample_num: number of samples to generate
        :param pca_output: the output path for the PCA figure
        :return:
        """
        x_fake, y_fake = self.create_fake_samples(batch_size=sample_num)
        fake_pred = self.discriminator_model.predict(x_fake)

        # Split the generated samples by whether they fooled the discriminator
        dis_fooled_scaled = np.asarray(list(compress(x_fake, fake_pred > 0.5)))
        dis_not_fooled_scaled = np.asarray(list(compress(x_fake, fake_pred <= 0.5)))

        # ------------- Euclidean -------------
        mean_min_distance_fooled, mean_min_distance_not_fooled = (-1, -1)
        if len(dis_fooled_scaled) > 0 and len(dis_not_fooled_scaled) > 0:
            mean_min_distance_fooled = self.get_mean_distance_score(scaled_data, dis_fooled_scaled)
            print(f'The mean minimum distance for fooled samples is {mean_min_distance_fooled}')
            mean_min_distance_not_fooled = self.get_mean_distance_score(scaled_data, dis_not_fooled_scaled)
            print(f'The mean minimum distance for not fooled samples is {mean_min_distance_not_fooled}')
        else:
            print('Either the fooled or the not-fooled set is empty; skipping the distance computation')

        # ------------- PCA --------------
        data_pca_df = self.get_pca_df(scaled_data, 'original')
        dis_fooled_pca_df = self.get_pca_df(dis_fooled_scaled, 'fooled')
        dis_not_fooled_pca_df = self.get_pca_df(dis_not_fooled_scaled, 'not fooled')
        pca_frames = [data_pca_df, dis_fooled_pca_df, dis_not_fooled_pca_df]
        pca_result = pd.concat(pca_frames)
        self.plot_pca(pca_result, pca_output)

        return dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled

    def get_mean_distance_score(self, scaled_data, dis_scaled):
        """
        This function returns the mean minimum-distance score for the given samples
        :param scaled_data: the original data
        :param dis_scaled: the generated samples to score
        :return:
        """
        dis_scaled_df = pd.DataFrame(dis_scaled)
        dis_scaled_df[['min_distance', 'similar_i']] = dis_scaled_df.apply(
            lambda x: self.return_minimum_euclidean_distance(scaled_data, x), axis=1)
        return dis_scaled_df['min_distance'].mean()

    @staticmethod
    def plot_pca(pca_result, pca_output):
        """
        This function plots the PCA figure
        :param pca_result: dataframe with all the results
        :param pca_output: output path
        :return:
        """
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel('Principal Component 1', fontsize=15)
        ax.set_ylabel('Principal Component 2', fontsize=15)
        ax.set_title('PCA With Two Components', fontsize=20)
        targets = ['original', 'fooled', 'not fooled']
        colors = ['r', 'g', 'b']
        for target, color in zip(targets, colors):
            indices_to_keep = pca_result['name'] == target
            ax.scatter(pca_result.loc[indices_to_keep, 'comp1'], pca_result.loc[indices_to_keep, 'comp2'],
                       c=color, s=50)
        ax.legend(targets)
        ax.grid()
        plt.savefig(pca_output)

    @staticmethod
    def get_pca_df(scaled_data, data_name):
        """
        This function creates the PCA dataframe (a separate PCA is fitted per data frame)
        :param scaled_data: the data to project
        :param data_name: the label stored in the 'name' column
        :return:
        """
        pca = PCA(n_components=2)
        principal_components = pca.fit_transform(scaled_data)
        principal_df = pd.DataFrame(data=principal_components, columns=['comp1', 'comp2'])
        principal_df['name'] = data_name
        return principal_df
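Because the constructor wires the generator, discriminator and combined model together, the class can be smoke-tested without any dataset; the full evaluation path, including the mean-minimum-distance metric, goes through test(). A minimal sketch on random data in [0, 1]; the 20-feature toy matrix, the single epoch and the 'outputs' directory are made-up values for illustration, not the experiment settings:

import os
import numpy as np
from nt_gan import GAN

os.makedirs('outputs', exist_ok=True)
toy = np.random.rand(1000, 20)  # stands in for the min-max scaled records

gan = GAN(number_of_features=20, saved_models_path='outputs',
          learning_rate=0.001, alpha_relu=0.2, dropout=0.3,
          loss='binary_crossentropy', activation='sigmoid')
# If 'toy' weights already exist in outputs/, train() loads them and returns (-1, -1)
gen_loss, dis_loss = gan.train(scaled_data=toy, epochs=1, batch_size=128,
                               to_plot_losses=False, model_name='toy')

# Sample a few fakes and see how the discriminator scores them
fake_x, fake_y = gan.create_fake_samples(batch_size=10)
print('discriminator scores on fakes:', gan.discriminator_model.predict(fake_x).ravel())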
nt_gg.py
ADDED
@@ -0,0 +1,282 @@
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Dense, Dropout, LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam
from numpy.random import randn
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from tqdm import tqdm


class GG(object):

    def __init__(self, number_of_features, saved_models_path, learning_rate, dropout, alpha):
        """
        The constructor for the General Generator class.
        :param number_of_features: Number of features in the data. Used to determine the noise dimensions
        :param saved_models_path: The folder where we save the models.
        :param learning_rate: learning rate for the generator optimizer
        :param dropout: dropout rate for the hidden layers
        :param alpha: slope of the LeakyReLU activations
        """
        self.saved_models_path = saved_models_path
        self.number_of_features = number_of_features

        self.generator_model = None
        self.discriminator_model = RandomForestClassifier()  # the black-box discriminator
        self.dropout = dropout
        self.alpha = alpha
        self.noise_dim = int(number_of_features / 2)
        self.learning_rate = learning_rate
        self.build_generator()  # build the generator
        self.losses = {'gen_loss': [], 'dis_loss_pred': [], 'dis_loss_proba': []}

    def build_generator(self):
        """
        This function creates the generator model for the GG.
        We used a fairly simple MLP architecture; the input is the noise vector
        plus one extra dimension for the requested confidence score.
        :return:
        """
        self.generator_model = Sequential()
        self.generator_model.add(Dense(int(self.number_of_features * 2), input_shape=(self.noise_dim + 1,)))
        self.generator_model.add(LeakyReLU(alpha=self.alpha))

        self.generator_model.add(Dense(int(self.number_of_features * 4)))
        self.generator_model.add(LeakyReLU(alpha=self.alpha))
        self.generator_model.add(Dropout(self.dropout))

        self.generator_model.add(Dense(int(self.number_of_features * 2)))
        self.generator_model.add(LeakyReLU(alpha=self.alpha))
        self.generator_model.add(Dropout(self.dropout))

        self.generator_model.add(Dense(self.number_of_features, activation='sigmoid'))
        optimizer = Adam(learning_rate=self.learning_rate)
        self.generator_model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    def train_gg(self, x_train, y_train, epochs, batch_size, model_name, data, output_path, to_plot=False):
        """
        This function runs the full training stage.
        :param x_train: the training set features
        :param y_train: the training set classes
        :param epochs: number of epochs
        :param batch_size: the batch size
        :param model_name: name of the model to save (for the generator)
        :param data: name of the dataset, used in figure titles
        :param output_path: path to save the loss figure
        :param to_plot: plots the losses if True
        :return:
        """
        losses_path = os.path.join(self.saved_models_path, f'{model_name}_losses')
        model_file = os.path.join(self.saved_models_path, f'{model_name}_part_2_gen_weights.h5')

        # First train the discriminator, then the generator against it
        self.train_black_box_dis(x_train, y_train)
        self.train_generator(x_train, model_file, epochs, batch_size, losses_path)
        if to_plot:
            self.plot_losses(data, output_path)

    def train_black_box_dis(self, x_train, y_train):
        """
        Trains the black-box discriminator (a random forest) and pickles it.
        If a saved model already exists, it is loaded instead of retrained.
        :param x_train: the training set features
        :param y_train: the training set classes
        :return:
        """
        dis_output = os.path.join(self.saved_models_path, 'black_box_dis_model')

        if os.path.exists(dis_output):
            # The black-box discriminator was already trained; load it and skip the fit
            with open(dis_output, 'rb') as rf_file:
                self.discriminator_model = pickle.load(rf_file)
            return

        self.discriminator_model.fit(x_train, y_train)
        with open(dis_output, 'wb') as rf_file:
            pickle.dump(self.discriminator_model, rf_file)

    def train_generator(self, data, model_path, epochs, start_batch_size, losses_path):
        """
        Function for training the general generator.
        :param data: The normalized dataset
        :param model_path: The path of the model to save; includes epoch size, batches etc.
        :param epochs: Number of epochs
        :param start_batch_size: Size of batch to use.
        :param losses_path: The file path for the loss results
        :return: trains the generator, then saves it and the losses recorded during training.
        """
        if os.path.exists(model_path):
            self.generator_model.load_weights(model_path)
            with open(losses_path, 'rb') as loss_file:
                self.losses = pickle.load(loss_file)
            return

        for epoch in range(epochs):  # iterates over the epochs
            np.random.shuffle(data)
            batch_size = start_batch_size
            for i in tqdm(range(0, data.shape[0], batch_size), ascii=True):  # Iterate over batches
                if data.shape[0] - i >= batch_size:
                    batch_input = data[i:i + batch_size]
                else:  # The last, smaller batch
                    batch_input = data[i:]
                    batch_size = batch_input.shape[0]

                g_loss = self.train_generator_on_batch(batch_input)
                self.losses['gen_loss'].append(g_loss)

        self.save_generator_model(model_path, losses_path)

    def save_generator_model(self, generator_model_path, losses_path):
        """
        Saves the model weights and the loss data (with pickle).
        :param generator_model_path: File path for the generator
        :param losses_path: File path for the losses
        :return:
        """
        self.generator_model.save_weights(generator_model_path)
        with open(losses_path, 'wb+') as loss_file:
            pickle.dump(self.losses, loss_file)

    def train_generator_on_batch(self, batch_input):
        """
        Trains the generator for a single batch. The input is noise plus the real
        probabilities obtained from the black box; the target output is the real
        samples plus the probabilities predicted by the generator itself.
        :param batch_input: one batch of real samples
        :return: the generator loss on this batch
        """
        batch_size = batch_input.shape[0]
        discriminator_probabilities = self.discriminator_model.predict_proba(batch_input)[:, -1:]
        noise = randn(batch_size, self.noise_dim)
        gen_model_input = np.hstack([noise, discriminator_probabilities])
        generated_probabilities = self.generator_model.predict(gen_model_input)[:, -1:]  # Take only the probabilities
        target_output = np.hstack([batch_input, generated_probabilities])
        g_loss = self.generator_model.train_on_batch(gen_model_input, target_output)  # The actual training

        return g_loss

    def plot_discriminator_results(self, x_test, y_test, data, path):
        """
        :param x_test: Test set
        :param y_test: Test classes
        :param data: Name of the dataset, used in figure titles
        :param path: Path to save the figures
        :return: Saves the required plots and prints the accuracy and confidence statistics.
        """
        blackbox_probs = self.discriminator_model.predict_proba(x_test)
        discriminator_predictions = self.discriminator_model.predict(x_test)
        count_1 = int(np.sum(y_test))
        count_0 = int(y_test.shape[0] - count_1)
        class_data = (['Class 0', 'Class 1'], [count_0, count_1])
        self.plot_data(class_data, path, mode='bar', x_title='Class', title=f'Distribution of classes - {data} dataset')
        self.plot_data(blackbox_probs[:, 0], path, title=f'Probabilities for test set - class 0 - {data} dataset')
        self.plot_data(blackbox_probs[:, 1], path, title=f'Probabilities for test set - class 1 - {data} dataset')

        min_confidence = blackbox_probs[:, 0].min(), blackbox_probs[:, 1].min()
        max_confidence = blackbox_probs[:, 0].max(), blackbox_probs[:, 1].max()
        mean_confidence = blackbox_probs[:, 0].mean(), blackbox_probs[:, 1].mean()

        print("Accuracy:", metrics.accuracy_score(y_test, discriminator_predictions))
        for c in [0, 1]:
            print(f'Class {c} - Min confidence: {min_confidence[c]} - Max Confidence: {max_confidence[c]} - '
                  f'Mean confidence: {mean_confidence[c]}')

    def plot_generator_results(self, data, path, num_of_instances=1000):
        """
        Creates plots for the generator results on num_of_instances generated samples.
        :param data: Name of dataset used.
        :param path: Path to save the figures
        :param num_of_instances: Number of samples to generate.
        :return:
        """
        sampled_proba, generated_instances = self.generate_n_samples(num_of_instances)

        proba_fake = self.discriminator_model.predict_proba(generated_instances[:, :-1])
        for c in [0, 1]:
            title = f'Confidence Score for Class {c} of Fake Samples - {data} dataset'
            self.plot_data(proba_fake[:, c], path, x_title='Confidence Score', title=title)

        black_box_confidence = proba_fake[:, 1:]
        proba_error = np.abs(sampled_proba - black_box_confidence)
        # reshape(-1, 1) rather than a hard-coded 1000, so num_of_instances can vary
        generated_classes = np.array([int(round(c)) for c in generated_instances[:, -1].tolist()]).reshape(-1, 1)
        proba_stats = np.hstack([sampled_proba, generated_classes, proba_fake[:, :1], proba_fake[:, 1:], proba_error])

        for c in [0, 1]:
            class_data = proba_stats[proba_stats[:, 1] == c]
            class_data = class_data[class_data[:, 0].argsort()]  # Sort it for the plot
            title = f'Error rate for different probabilities, class {c} - {data} dataset'
            self.plot_data((class_data[:, 0], class_data[:, -1]), path, mode='plot', y_title='error rate', title=title)

    def generate_n_samples(self, n):
        """
        Function for generating n samples with uniformly distributed confidence levels.
        :param n: Number of samples
        :return: a tuple of the confidence scores used and the samples created.
        """
        noise = randn(n, self.noise_dim)
        confidences = np.random.uniform(0, 1, (n, 1))

        generator_input = np.hstack([noise, confidences])  # Stick them together
        generated_instances = self.generator_model.predict(generator_input)  # Create samples

        return confidences, generated_instances

    @staticmethod
    def plot_data(data, path, mode='hist', x_title='Probabilities', y_title='# of Instances', title='Distribution'):
        """
        :param data: Data to plot
        :param path: Path to save
        :param mode: Plot mode to use ('hist', 'bar' or 'plot')
        :param x_title: Title of x axis
        :param y_title: Title of y axis
        :param title: Title of plot
        :return: Saves a plot
        """
        plt.clf()

        if mode == 'hist':
            plt.hist(data)
        elif mode == 'bar':
            plt.bar(data[0], data[1])
        else:
            plt.plot(data[0], data[1])

        plt.title(title)
        plt.ylabel(y_title)
        plt.xlabel(x_title)
        path = os.path.join(path, title)
        plt.savefig(path)

    def plot_losses(self, data, path):
        """
        Plot the generator losses recorded while training
        :param data: Name of the dataset, used in the file name
        :param path: Path to save the figure
        :return:
        """
        plt.clf()
        plt.plot(self.losses['gen_loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Iteration')
        plt.savefig(os.path.join(path, f'{data} dataset - general_generator_loss.png'))

    def get_error(self, num_of_instances=1000):
        """
        Calculates the error of the generator by measuring the difference between
        the probability that was given as input and the probability the
        discriminator assigns to the sample created.
        :param num_of_instances: Number of samples to generate.
        :return: An array of errors.
        """
        sampled_proba, generated_instances = self.generate_n_samples(num_of_instances)
        proba_fake = self.discriminator_model.predict_proba(generated_instances[:, :-1])
        black_box_confidence = proba_fake[:, 1:]
        return np.abs(sampled_proba - black_box_confidence)
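The same kind of standalone check works for the general generator: train the random-forest black box on toy labels, fit the generator to hit requested confidence levels, and read off the mean confidence error. The toy data and sizes below are illustrative assumptions; note that GG expects number_of_features to count the class column as well, matching how second_question passes the full column count:

import os
import numpy as np
from nt_gg import GG

os.makedirs('outputs', exist_ok=True)
x = np.random.rand(2000, 20)          # stands in for the scaled feature columns
y = (x[:, 0] > 0.5).astype(int)       # a toy label the random forest can learn

# number_of_features counts the class column too, hence 21 for 20 features
gg = GG(number_of_features=21, saved_models_path='outputs',
        learning_rate=0.001, dropout=0.3, alpha=0.2)
gg.train_gg(x, y, 1, 128, 'toy_gg', 'toy', 'outputs', to_plot=False)

# Mean absolute gap between the requested confidence and the confidence
# the black box actually assigns to the generated samples.
print(gg.get_error(num_of_instances=200).mean())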
outputs/empty
ADDED
@@ -0,0 +1 @@