Anomaly Detection

import numpy as np

import matplotlib.pyplot as plt

from utils import *

%matplotlib inline

# Read the training set, the validation set and the validation labels
X_train, X_val, y_val = load_data()

# Preview the leading five entries of each array
print("The first 5 elements of X_train are:\n", X_train[:5])
print("The first 5 elements of X_val are\n", X_val[:5])
print("The first 5 elements of y_val are\n", y_val[:5])

# Report the dimensions of each array
print('The shape of X_train is:', X_train.shape)
print('The shape of X_val is:', X_val.shape)
print('The shape of y_val is: ', y_val.shape)

# Scatter the first two training columns; marker='x' and c='b' give blue crosses
plt.scatter(X_train[:, 0], X_train[:, 1], marker='x', c='b')

# Axis labels and title (independent of each other, so order is irrelevant)
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
plt.title("The first dataset")

# Pin both axes to the range [0, 30] and render the figure
plt.axis([0, 30, 0, 30])
plt.show()

def estimate_gaussian(X):
    """
    Estimate the per-feature mean and variance of a dataset.

    The variance is the population variance (divides by m, not m - 1).

    Args:
        X (ndarray): (m, n) Data matrix, one example per row

    Returns:
        mu (ndarray): (n,) Mean of all features
        var (ndarray): (n,) Variance of all features

    Raises:
        ValueError: if X is not two-dimensional or contains no rows
    """
    X = np.asarray(X)

    if X.ndim != 2:
        raise ValueError(
            "X must be a 2-D (m, n) array, got %d dimension(s)" % X.ndim
        )
    if X.shape[0] == 0:
        # Fail loudly rather than dividing by zero / returning NaN
        raise ValueError("X must contain at least one example")

    # Column-wise statistics; np.var defaults to ddof=0 (divide by m)
    mu = np.mean(X, axis=0)
    var = np.var(X, axis=0)

    return mu, var

# Fit the per-feature Gaussian parameters on the training set
mu, var = estimate_gaussian(X_train)

print("Mean of each feature:", mu)
print("Variance of each feature:", var)

# UNIT TEST
from public_tests import *

estimate_gaussian_test(estimate_gaussian)

# Density of the multivariate normal evaluated at every row of X_train
# (as described for this helper; it is defined outside this file)
p = multivariate_gaussian(X_train, mu, var)

# Plot the training data together with the fitted distribution
visualize_fit(X_train, mu, var)

def select_threshold(y_val, p_val):
    """
    Finds the best threshold to use for selecting outliers
    based on the results from a validation set (p_val)
    and the ground truth (y_val).

    Up to 1000 evenly spaced candidate thresholds between min(p_val) and
    max(p_val) are tried; an example is predicted anomalous when its
    value in p_val is strictly below the threshold. The candidate with
    the highest F1 score wins (first one on ties).

    Args:
        y_val (ndarray): Ground truth on validation set (1 = anomaly, 0 = normal)
        p_val (ndarray): Results on validation set

    Returns:
        epsilon (float): Threshold chosen (0 if no candidate yields a true positive)
        F1 (float): F1 score by choosing epsilon as threshold (0 in that same case)
    """
    y_val = np.asarray(y_val)
    p_val = np.asarray(p_val)

    best_epsilon = 0
    best_F1 = 0

    # Compute the range once with the vectorised reductions
    p_min = p_val.min()
    p_max = p_val.max()
    step_size = (p_max - p_min) / 1000

    # All values identical (or NaN range): there is nothing to scan and
    # np.arange cannot be called with a zero step
    if not step_size > 0:
        return best_epsilon, best_F1

    # Loop-invariant masks for the two ground-truth classes
    is_anomaly = (y_val == 1)
    is_normal = (y_val == 0)

    for epsilon in np.arange(p_min, p_max, step_size):
        predictions = (p_val < epsilon)

        tp = np.sum(predictions & is_anomaly)
        if tp == 0:
            # Precision and/or recall would be 0/0 here; F1 cannot beat
            # the current best, so skip instead of producing NaN warnings
            continue

        fp = np.sum(predictions & is_normal)
        fn = np.sum(~predictions & is_anomaly)

        prec = tp / (tp + fp)
        rec = tp / (tp + fn)
        F1 = 2 * prec * rec / (prec + rec)

        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon

    return best_epsilon, best_F1

# Evaluate the fitted model on the validation set and pick the threshold
p_val = multivariate_gaussian(X_val, mu, var)
epsilon, F1 = select_threshold(y_val, p_val)

print('Best epsilon found using cross-validation: %e' % epsilon)
print('Best F1 on Cross Validation Set: %f' % F1)

# UNIT TEST
select_threshold_test(select_threshold)

# Flag every training example whose value in p is below the threshold
outliers = p < epsilon

# Redraw the fitted distribution over the training data
visualize_fit(X_train, mu, var)

# Mark the flagged examples with hollow red circles
plt.plot(
    X_train[outliers, 0],
    X_train[outliers, 1],
    'ro',
    markersize=10,
    markerfacecolor='none',
    markeredgewidth=2,
)

# Load the second dataset
X_train_high, X_val_high, y_val_high = load_data_multi()

print ('The shape of X_train_high is:', X_train_high.shape)
print ('The shape of X_val_high is:', X_val_high.shape)
print ('The shape of y_val_high is: ', y_val_high.shape)

# Apply the same steps to the larger dataset.
# (This section previously appeared twice verbatim, recomputing and
# re-printing identical results; it now runs once.)

# Estimate the Gaussian parameters
mu_high, var_high = estimate_gaussian(X_train_high)

# Evaluate the probabilities for the training set
p_high = multivariate_gaussian(X_train_high, mu_high, var_high)

# Evaluate the probabilities for the cross validation set
p_val_high = multivariate_gaussian(X_val_high, mu_high, var_high)

# Find the best threshold
epsilon_high, F1_high = select_threshold(y_val_high, p_val_high)

print('Best epsilon found using cross-validation: %e'% epsilon_high)
print('Best F1 on Cross Validation Set: %f'% F1_high)

# np.sum counts the True entries in one vectorised pass
# (builtin sum iterates the array element by element)
print('# Anomalies found: %d'% np.sum(p_high < epsilon_high))

Bu Blogda Ara

Emir of Things

Anomaly Detection

Yorumlar

Yorum Gönder

Bu blogdaki popüler yayınlar

Bias & variance

Gradient descent for linear regression