Kaggle Submission Example
Notebook and Code from https://github.com/jeffheaton/t81_558_deep_learning
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
# Load the iris training data; 'NA' and '?' are read as missing values.
df_train = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/datasets/"
    "kaggle_iris_train.csv",
    na_values=['NA', '?'])

# Encode feature vector
# Drop the 'id' column; it is not part of the feature columns selected below.
df_train.drop('id', axis=1, inplace=True)

# Number of distinct (non-missing) species labels in the training data.
num_classes = df_train['species'].nunique()
print("Number of classes: {}".format(num_classes))

# Convert to numpy - Classification
x = df_train[['sepal_l', 'sepal_w', 'petal_l', 'petal_w']].values
dummies = pd.get_dummies(df_train['species'])  # One-hot encode the target
species = dummies.columns  # Class labels, in the same order as y's columns
y = dummies.values

# Split into train/test (25% held out, fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=45)

# Train, with early stopping
model = Sequential()
model.add(Dense(50, input_dim=x.shape[1], activation='relu'))  # Hidden 1
# Hidden 2 uses relu as well: without an activation the layer is purely
# linear, and it would simply be folded into the output layer's linear map.
model.add(Dense(25, activation='relu'))  # Hidden 2
model.add(Dense(y.shape[1], activation='softmax'))  # Output: one unit/class
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Watch the validation loss with a patience of 5 epochs and a minimum
# improvement of 1e-3; the best weights seen are restored when training stops.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                        patience=5, verbose=1, mode='auto',
                        restore_best_weights=True)
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          callbacks=[monitor], verbose=0, epochs=1000)
Now that we've trained the neural network, we can check its log loss.
from sklearn import metrics
# Score the classifier on the held-out split: predict the class
# probabilities, then compute the multi-class log loss against the
# one-hot targets.
pred = model.predict(x_test)
score = metrics.log_loss(y_true=y_test, y_pred=pred)
print("Log loss score: {}".format(score))
Now we are ready to generate the Kaggle submission file. We will use the iris test data, which does not contain a $y$ target value. It is our job to predict this value and submit the predictions to Kaggle.
# Generate Kaggle submit file

# Load the iris test data; 'NA' and '?' are read as missing values.
df_test = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/datasets/"
    "kaggle_iris_test.csv",
    na_values=['NA', '?'])

# Convert to numpy - Classification
# Keep the ids for the submission file before dropping the column.
ids = df_test['id']
df_test.drop('id', axis=1, inplace=True)
x = df_test[['sepal_l', 'sepal_w', 'petal_l', 'petal_w']].values
# Only the feature columns are read from the test file, so y is left
# untouched here.

# Generate predictions (one probability column per class)
pred = model.predict(x)

# Create submission data set: 'id' first, then one 'species-N' column for
# each prediction column, so the header always matches the model's width.
df_submit = pd.DataFrame(pred)
df_submit.insert(0, 'id', ids)
df_submit.columns = ['id'] + [
    'species-{}'.format(i) for i in range(pred.shape[1])]

# Write submit file locally
df_submit.to_csv("iris_submit.csv", index=False)
print(df_submit)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics
save_path = "."

# Load the auto-mpg training data; 'NA' and '?' are read as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/datasets/"
    "kaggle_auto_train.csv",
    na_values=['NA', '?'],
)

cars = df['name']

# Fill missing horsepower entries with the column median.
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

# Pandas to Numpy: feature matrix and regression target.
x = df[[
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'year',
    'origin',
]].to_numpy()
y = df['mpg'].to_numpy()  # regression

# Hold out 25% of the rows, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Build the neural network: two relu hidden layers and one linear output.
model = Sequential([
    Dense(25, input_dim=x.shape[1], activation='relu'),  # Hidden 1
    Dense(10, activation='relu'),                        # Hidden 2
    Dense(1),                                            # Output
])
model.compile(loss='mean_squared_error', optimizer='adam')

# Early stopping on the validation loss; best weights are restored.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    verbose=2,
    callbacks=[monitor],
    epochs=1000,
)
# Predict on the held-out split
pred = model.predict(x_test)
import numpy as np
# Measure RMSE error. RMSE is common for regression.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the
# log-loss call used for the iris model; the squared error is symmetric,
# so the resulting score is unchanged.
score = np.sqrt(metrics.mean_squared_error(y_test, pred))
print("Final score (RMSE): {}".format(score))
import pandas as pd
# Build the Kaggle submission for the auto-mpg model.

# Load the auto-mpg test data; 'NA' and '?' are read as missing values.
df_test = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/datasets/"
    "kaggle_auto_test.csv",
    na_values=['NA', '?'],
)

# Pull the ids out of the frame (removes the column in place) so they can
# be attached to the predictions afterwards.
ids = df_test.pop('id')

# Fill missing horsepower values with the median taken from the training
# frame `df`, the same fill value that was applied to the training data.
df_test['horsepower'] = df_test['horsepower'].fillna(
    df['horsepower'].median())

# Convert to numpy - regression
x = df_test[[
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'year',
    'origin',
]].to_numpy()

# Generate predictions
pred = model.predict(x)

# Assemble the submission table: 'id' first, then the predicted 'mpg'.
df_submit = pd.DataFrame(pred, columns=['mpg'])
df_submit.insert(0, 'id', ids)

# Write submit file locally
df_submit.to_csv("auto_submit.csv", index=False)
print(df_submit)