# Star Classification

Authors: Mikaila Gossman, Ben Shealy

This notebook demonstrates how to build and train a neural network for the [PLAsTiCC Astronomical Classification](https://www.kaggle.com/c/PLAsTiCC-2018) challenge on Kaggle. This challenge, released in 2018, asked competitors to “classify astronomical sources that vary with time into different classes.”

## Getting Started

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from tensorflow import keras

## Load the Data

The dataset consists of light curve data and metadata. Each sample has an object ID denoting the corresponding astronomical object. The `target` column in the metadata is the label that we want to predict.

In [None]:
# locations of the training metadata and light curve CSV files
meta_path = 'plasticc-kit-master/data/training_set_metadata.csv'
light_path = 'plasticc-kit-master/data/training_set.csv'

# read both tables into dataframes
df_meta = pd.read_csv(meta_path)
df_light = pd.read_csv(light_path)

In [None]:
# preview the first five rows of the metadata
df_meta.head(n=5)

In [None]:
# preview the first five rows of the light curve data
df_light.head(n=5)

## Prepare the Training Data

The training data consists of the light curve data and all of the metadata except for the `target` column, so we need to merge the two dataframes that we just loaded.

In [None]:
# join the metadata with the light curve data on the columns they share
df_merged = df_meta.merge(df_light)

# keep the label column aside before it is removed from the features
labels = df_merged.loc[:, 'target']

In [None]:
# preview the first five rows of the merged dataframe
df_merged.head(n=5)

Now that we have all of the data together, we can perform a correlation analysis to see whether any features are highly correlated with each other. This code will produce a correlation heatmap in which strongly correlated pairs of features appear dark blue (positive correlation) or dark red (negative correlation).

In [None]:
# compute correlation matrix
corr = df_merged.corr()

# generate a mask for the upper triangle (including the diagonal);
# the builtin `bool` is used because the `np.bool` alias was removed from NumPy
mask = np.triu(np.ones_like(corr, dtype=bool))

# set up the matplotlib figure
fig, ax = plt.subplots(figsize=(10, 8))

# draw the heatmap with the mask and correct aspect ratio,
# explicitly targeting the axes created above
sns.heatmap(
    corr,
    mask=mask,
    cmap='RdBu',
    center=0,
    square=True,
    linewidths=0.2,
    cbar_kws={"shrink": 0.5},
    ax=ax)

# show the plot
plt.show()

In [None]:
# remove the object identifier, the highly correlated fields, and the
# `target` column (the label must not be part of the input features)
drop_columns = [
    "object_id",
    "decl",
    "gal_l",
    "gal_b",
    "ddf",
    "distmod",
    "target"
]

# `drop` returns a new dataframe; `df_merged` itself is left untouched
data = df_merged.drop(columns=drop_columns)

# check the final dataframe (the reduced one, not the original merged frame)
data.head()

In [None]:
# summary statistics for each remaining feature column
summary_stats = data.describe()
summary_stats

In [None]:
# features are the reduced dataframe; labels become a flat 1-D array
X = data
y = np.asarray(labels).ravel()

In [None]:
# preview the first five rows of the feature matrix
X.head(n=5)

In [None]:
# `y` is a NumPy array (it was produced by `np.ravel`), which has no
# `.head()` method, so slice it to preview the first five labels
y[:5]

## Prepare Train/Test Data

In [None]:
# map the raw class labels to consecutive integer indices ...
label_encoder = sklearn.preprocessing.LabelEncoder()
y_map = label_encoder.fit_transform(y)

# ... and one-hot encode those indices for the softmax output
y_cat = keras.utils.to_categorical(y_map)

print(y_cat)

In [None]:
# extract train and test sets; a fixed `random_state` makes the split
# (and therefore every downstream result) reproducible across runs
# NOTE(review): rows are split individually, so if several rows share an
# object (same metadata values) they can land in both sets — consider a
# group-aware split on `object_id`; confirm against the data.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y_cat, test_size=0.2, random_state=42)

# print shapes of train/test sets
for name, arr in [("X_train", X_train), ("y_train", y_train),
                  ("X_test", X_test), ("y_test", y_test)]:
    print(name + ":", arr.shape)

In [None]:
# standardize the features using statistics computed on the training set only
scaler = sklearn.preprocessing.StandardScaler().fit(X_train)

# apply the same fitted transformation to both splits
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Train and Evaluate MLP

In [None]:
# define MLP model

# derive the layer sizes from the data instead of hard-coding them, so the
# model stays consistent with the columns kept and the classes present
n_features = X_train.shape[1]
n_classes = y_train.shape[1]
hidden_units = 512

mlp = keras.models.Sequential()
mlp.add(keras.layers.Dense(hidden_units, activation="selu", input_shape=(n_features,)))
# use integer division: `512/4` is the float 128.0 in Python 3, and the
# number of units of a Dense layer must be an integer
mlp.add(keras.layers.Dense(hidden_units // 4, activation="tanh"))
mlp.add(keras.layers.Dense(hidden_units // 4, activation="relu"))
# one softmax output per class
mlp.add(keras.layers.Dense(n_classes, activation="softmax"))

mlp.summary()

In [None]:
# configure the optimizer, loss and tracked metric
mlp.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)

# fit on the training set, holding out 10% of it for validation
history = mlp.fit(
    X_train,
    y_train,
    batch_size=500,
    epochs=200,
    validation_split=0.1,
)

In [None]:
# plot training history: one figure per metric, comparing the training curve
# with the validation curve (the `val_*` entries come from the
# `validation_split` hold-out used during fitting, not from the test set)
history_plots = [
    # (history key, figure title, y-axis label)
    ('categorical_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
]

for metric, title, ylabel in history_plots:
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history['val_' + metric], label='Validation')
    ax.set(title=title, xlabel='Epoch', ylabel=ylabel)
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# compute the loss and metric values on the held-out test set
test_metrics = mlp.evaluate(X_test, y_test)
test_metrics

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true : array-like
        True class of each sample (passed to `sklearn.metrics.confusion_matrix`).
    y_pred : array-like
        Predicted class of each sample.
    classes : sequence
        Tick labels for the rows/columns of the matrix, in matrix order.
    normalize : bool, default False
        If True, each row is divided by its own total, so every cell shows
        the fraction of samples of that true class.
    cmap : matplotlib colormap, default `plt.cm.Blues`
        Colormap used to draw the matrix.

    Returns
    -------
    matplotlib.axes.Axes
        The axes containing the plotted matrix.
    """
    # Compute confusion matrix (rows: true labels, columns: predicted labels)
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred)

    # apply normalization if specified
    if normalize:
        title = "Confusion matrix (normalized)"
        cm = cm.astype("float32")
        # keepdims gives a column vector, so each ROW is divided by its own
        # total; a plain `cm.sum(axis=1)` would broadcast along the columns
        row_totals = cm.sum(axis=1, keepdims=True)
        # rows without any true sample stay all-zero instead of becoming NaN
        row_totals[row_totals == 0] = 1
        cm = cm / row_totals
    else:
        title = "Confusion matrix (not normalized)"

    fig, ax = plt.subplots(figsize=(9, 7))
    im = ax.imshow(cm, interpolation="nearest", cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes,
           yticklabels=classes,
           title=title,
           ylabel="True label",
           xlabel="Predicted label")

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = ".2f" if normalize else "d"
    # cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax



# plot confusion matrix to better understand the results
np.set_printoptions(precision=2)

# predict class probabilities for the test set; `y_pred` was previously
# never defined, so this cell failed with a NameError on a fresh kernel
y_pred = mlp.predict(X_test)

# original class labels, sorted (used as tick labels)
classes = np.unique(y)

# convert one-hot / probability rows back to class indices
y_test2 = np.argmax(y_test, axis=1)
y_pred2 = np.argmax(y_pred, axis=1)

# plot non-normalized confusion matrix
plot_confusion_matrix(y_test2, y_pred2, classes=classes)

# plot normalized confusion matrix
plot_confusion_matrix(y_test2, y_pred2, classes=classes, normalize=True)

plt.show()