Browse Source

Trained basic multiclass SVM

Simon Lousky 1 month ago
parent
commit
3481036685
6 changed files with 155 additions and 0 deletions
  1. code/featurization.py (+44, −0)
  2. code/train_model.py (+57, −0)
  3. data/.gitignore (+4, −0)
  4. dvc.lock (+28, −0)
  5. dvc.yaml (+21, −0)
  6. metrics/train_metric.json (+1, −0)

+ 44
- 0
code/featurization.py

@@ -0,0 +1,44 @@
+"""
+Create feature CSVs for train and test datasets
+"""
+import json
+import numpy as np
+import pandas as pd
+
+
def featurization():
    """Build normalized feature arrays for the train and test datasets.

    Reads ./data/train_data.csv and ./data/test_data.csv (no header,
    label in column 0, pixel values in the remaining columns), normalizes
    the feature columns using the *train* set's mean/std, and writes:

    - ./data/processed_train_data.npy
    - ./data/processed_test_data.npy
    - ./data/norm_params.json  (the mean/std, kept for inference time)
    """
    # Load data-sets
    print("Loading data sets...")
    train_data = pd.read_csv('./data/train_data.csv', header=None, dtype=float)
    test_data = pd.read_csv('./data/test_data.csv', header=None, dtype=float)
    print("done.")

    print("Normalizing data...")
    # Work on explicit NumPy copies. The original code mutated the array
    # returned by `.values`, which is only a view for single-dtype frames —
    # an implementation detail that silently breaks (normalization becomes
    # a no-op) under pandas copy-on-write. `to_numpy()` copies by default,
    # so in-place edits below are always safe and always take effect.
    train = train_data.to_numpy()
    test = test_data.to_numpy()

    # Column 0 holds the labels, so statistics come from columns 1: only.
    train_mean = train[:, 1:].mean()
    train_std = train[:, 1:].std()

    # Normalize train and test data according to the train distribution
    # (test must use the train stats to avoid leakage).
    train[:, 1:] = (train[:, 1:] - train_mean) / train_std
    test[:, 1:] = (test[:, 1:] - train_mean) / train_std
    print("done.")

    print("Saving processed datasets and normalization parameters...")
    # Save normalized data-sets (labels still in column 0).
    np.save('./data/processed_train_data', train)
    np.save('./data/processed_test_data', test)

    # Save mean and std for future inference; cast to plain float so the
    # payload is portable JSON regardless of the NumPy scalar type.
    with open('./data/norm_params.json', 'w') as f:
        json.dump({'mean': float(train_mean), 'std': float(train_std)}, f)

    print("done.")


if __name__ == '__main__':
    featurization()
+

+ 57
- 0
code/train_model.py

@@ -0,0 +1,57 @@
+"""
+Train classification model for MNIST
+"""
+import json
+import pickle
+import numpy as np
+from sklearn.svm import SVC
+from sklearn.multiclass import OneVsRestClassifier
+import time
+
+
def train_model(num_samples=5000):
    """Train a one-vs-rest linear SVM on the processed MNIST features.

    Loads ./data/processed_train_data.npy (label in column 0), trains on a
    random subsample, and writes the fitted model to ./data/model.pkl plus
    a training-time metric to ./metrics/train_metric.json.

    Args:
        num_samples: upper bound on the number of training rows to use
            (default 5000, matching the original hard-coded value). The
            subsample keeps SVM training tractable, since fit time grows
            roughly quadratically with the number of samples.
    """
    # Measure training time
    start_time = time.time()

    # Load training data
    print("Load training data...")
    train_data = np.load('./data/processed_train_data.npy')

    print("Choosing smaller sample to shorten training time...")
    # Fixed seed so the "random" subsample is reproducible across runs.
    np.random.seed(42)

    # Clamp to the dataset size: np.random.choice(n, k, replace=False)
    # raises ValueError when k > n, which previously crashed this stage on
    # any dataset smaller than 5000 rows.
    num_samples = min(num_samples, train_data.shape[0])
    choice = np.random.choice(train_data.shape[0], num_samples, replace=False)
    train_data = train_data[choice, :]

    # Divide loaded data-set into data and labels
    labels = train_data[:, 0]
    data = train_data[:, 1:]
    print("done.")

    # Define SVM classifier and train model
    print("Training model...")
    model = OneVsRestClassifier(SVC(kernel='linear'), n_jobs=6)
    model.fit(data, labels)
    print("done.")

    # Save model as pkl
    print("Save model and training time metric...")
    with open("./data/model.pkl", 'wb') as f:
        pickle.dump(model, f)

    # End training time measurement
    end_time = time.time()

    # Create metric for model training time
    with open('./metrics/train_metric.json', 'w') as f:
        json.dump({'training_time': end_time - start_time}, f)
    print("done.")


if __name__ == '__main__':
    train_model()
+

+ 4
- 0
data/.gitignore

@@ -1,2 +1,6 @@
 /train_data.csv
 /test_data.csv
+/norm_params.json
+/processed_train_data.npy
+/processed_test_data.npy
+/model.pkl

+ 28
- 0
dvc.lock

@@ -0,0 +1,28 @@
+featurization:
+  cmd: python code/featurization.py
+  deps:
+  - path: code/featurization.py
+    md5: e570a5b45022e46e9d6ad9cd6f2a1887
+  - path: data/test_data.csv
+    md5: c807df8d6d804ab2647fc15c3d40f543
+  - path: data/train_data.csv
+    md5: 5b49cf1b57fb9d6102b559d59d99df7c
+  outs:
+  - path: data/norm_params.json
+    md5: e46984ac8b7097bfddfe5d9210f78ca4
+  - path: data/processed_test_data.npy
+    md5: a5257a91e73920bdd4cafd0f88105b74
+  - path: data/processed_train_data.npy
+    md5: 9ee0468925c998fda26d197a14d1caec
+training:
+  cmd: python code/train_model.py
+  deps:
+  - path: code/train_model.py
+    md5: 655c3242c17b3d0213d7ce4d9f78344d
+  - path: data/processed_train_data.npy
+    md5: 9ee0468925c998fda26d197a14d1caec
+  outs:
+  - path: data/model.pkl
+    md5: d72417286b48ba89f02d83ca3f9642d4
+  - path: metrics/train_metric.json
+    md5: 27598e27b215ebb84ec027f92fc1b0ca

+ 21
- 0
dvc.yaml

@@ -0,0 +1,21 @@
+stages:
+  featurization:
+    cmd: python code/featurization.py
+    deps:
+    - code/featurization.py
+    - data/test_data.csv
+    - data/train_data.csv
+    outs:
+    - data/norm_params.json
+    - data/processed_test_data.npy
+    - data/processed_train_data.npy
+  training:
+    cmd: python code/train_model.py
+    deps:
+    - code/train_model.py
+    - data/processed_train_data.npy
+    outs:
+    - data/model.pkl
+    metrics:
+    - metrics/train_metric.json:
+        cache: false

+ 1
- 0
metrics/train_metric.json

@@ -0,0 +1 @@
+{"training_time": 11.965423107147217}