In this post, we provide a basic framework for machine learning practice, based mainly on scikit-learn (sklearn) and pandas.

Here we first show the basic steps:

  1. Read data
  2. Data preprocessing (including downsampling, if needed)
  3. Feature engineering
  4. Training
  5. Evaluation

Classification

Libraries

import os
import random
import pandas as pd
import matplotlib.pyplot as plt
# Sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
# Metrics
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score,\
precision_recall_curve,precision_score,recall_score,auc,roc_curve
# XGBoost
from xgboost import XGBClassifier

Auxiliary Functions

## Classification metrics (binary classification)
def classification_scores(y_test, result):
    scores = {}
    #scores['classification_report'] = classification_report(y_test, result)
    scores['accuracy_score'] = accuracy_score(y_test, result)
    scores['confusion_matrix'] = confusion_matrix(y_test, result)
    scores['f1_score'] = f1_score(y_test, result)
    #scores['precision_recall_curve'] = precision_recall_curve(y_test, result)
    scores['precision_score'] = precision_score(y_test, result)
    scores['recall_score'] = recall_score(y_test, result)
    return scores
  
## Dataframe downsample
def df_downsample(df, n_sample):
    if n_sample > len(df.index):
        raise ValueError('n_sample too large!')
    # Sample n_sample row positions without replacement
    index = random.sample(range(0, len(df.index)), n_sample)
    df = df.reset_index(drop=True)
    df_downsampled = df.iloc[index,:]
    print('Original:', len(df.index), 'After:', len(df_downsampled.index))
    return df_downsampled
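
For example, with a hypothetical toy dataframe:

df_toy = pd.DataFrame({'a': range(10)})
df_small = df_downsample(df_toy, 3)  # keeps 3 randomly chosen rows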

Read Data

## Read from local file
data_path = 'where_your_file_is_in'
filename = "your_data.csv"
file_path = os.path.join(data_path,filename)
df_data = pd.read_csv(file_path)
# print(df_data.iloc[0])

Preprocessing

## Check data types
print('\nData types.')
print(df_data.dtypes)

## Data type transformation (object to string, string to numeric)
df_data['feature_1_str'] = df_data['feature_1_obj'].astype(str)
df_data['feature_2_num'] = pd.to_numeric(df_data['feature_2_str'])
# pd.to_numeric(..., errors='coerce') turns unparseable values into NaN instead of raising

## Missing data
# Fill NULLs in one column with zeros
df_data['some_feature'] = df_data['some_feature'].fillna(0)
# Fill all NaN with a scalar
# df_data.fillna(0, inplace=True)
# df_data.replace({'nan': 0}, inplace=True)

## Remove outliers
# Remove rows whose some_feature is NULL/zero
# (see the distribution-based filter sketched below)
df_data = df_data[df_data.some_feature != 0]
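
For genuine outliers, a common distribution-based filter is the 1.5×IQR rule; a minimal sketch, assuming the numeric column some_feature:

# Keep only rows within 1.5 IQR of the quartiles (assumed column: some_feature)
q1 = df_data['some_feature'].quantile(0.25)
q3 = df_data['some_feature'].quantile(0.75)
iqr = q3 - q1
df_data = df_data[(df_data['some_feature'] >= q1 - 1.5*iqr) &
                  (df_data['some_feature'] <= q3 + 1.5*iqr)]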

## A quick description of the data
df_data.describe()
## Good habit: have a look at the histograms of the data
fig = plt.figure(figsize=(15,20))
ax = fig.gca()
df_data.hist(bins=10, ax=ax)
plt.show()

Feature Engineering

## Generate new features
df_data['some_new_feature']=df_data.apply(some_function,axis=1)
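
Here some_function is any row-wise transformation. A minimal hypothetical example (the column name and threshold are illustrative assumptions, not part of the original data):

def some_function(row):
    # Hypothetical: binary flag for large values of feature_2_num
    return 1 if row['feature_2_num'] > 100 else 0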

Training

## Train with existing libraries (sklearn)
# (the classifiers were already imported in the Libraries section above)

## Split train and test
from sklearn.model_selection import train_test_split

LABEL = 'your_label_column'  # placeholder: name of the target column
# Keep the label column in the train/test frames for the rebalance step below
df_train, df_test = train_test_split(df_data, test_size=0.2, random_state=0)

## Class rebalance
# Before resample
print('Before resample:')
print(df_train[LABEL].value_counts())
# Separate majority and minority classes (two-class problem);
# value_counts() sorts descending, so index 0 is the majority class
majority_label = df_train[LABEL].value_counts().index[0]
minority_label = df_train[LABEL].value_counts().index[1]
df_train_majority = df_train[df_train[LABEL]==majority_label]
df_train_minority = df_train[df_train[LABEL]==minority_label]
# Downsample majority class
df_train_majority_downsampled = df_downsample(df_train_majority, len(df_train_minority.index))
# Combine minority class with downsampled majority class
df_train = pd.concat([df_train_majority_downsampled, df_train_minority])
# Display class counts afterwards
print('After resample:\n', df_train[LABEL].value_counts())

# Separate features and labels
X_train = df_train.drop(columns=[LABEL])
y_train = df_train[LABEL]
X_test = df_test.drop(columns=[LABEL])
y_test = df_test[LABEL]
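
As an aside, sklearn's resample utility can do the same downsampling as the custom df_downsample helper (random_state here is arbitrary):

from sklearn.utils import resample
df_train_majority_downsampled = resample(df_train_majority, replace=False,
                                         n_samples=len(df_train_minority.index),
                                         random_state=0)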

## Build model
# GBDT
clf = GradientBoostingClassifier(n_estimators=250, max_depth=3,
                                 learning_rate=.1, min_samples_leaf=9,
                                 min_samples_split=9)
# Random Forest
rf = RandomForestClassifier(max_depth=2, random_state=0)
# KNN
neigh = KNeighborsClassifier(n_neighbors=2)
# SVM
svm = SVC(C=1.0)
# Neural network
nn = MLPClassifier()
# XGBoost
xgb = XGBClassifier()

method_dict = {'GBDT':clf, 'Random Forest':rf, 'KNN':neigh, 'Neural Network':nn,
               'SVM':svm, 'XGBoost':xgb}

print(X_train.head(5))

## Train and evaluate each method on the held-out test set
for method in method_dict: 
    method_dict[method].fit(X_train.values,y_train.values.ravel())
    y_pred = method_dict[method].predict(X_test.values)
    print(method)
    # Evaluation
    scores = classification_scores(y_test,y_pred)
    for key in scores:
        print(key, scores[key])
    print('\n')
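
For actual cross validation (rather than a single train/test evaluation), sklearn's cross_val_score can be used; a minimal sketch, where the cv=5 folds and 'f1' scoring are assumptions:

from sklearn.model_selection import cross_val_score

for method in method_dict:
    # 5-fold cross validation on the training set
    cv_scores = cross_val_score(method_dict[method], X_train.values,
                                y_train.values.ravel(), cv=5, scoring='f1')
    print(method, 'mean CV f1:', cv_scores.mean())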

Evaluation

## Some metrics to evaluate the models
# Test a trained model on (new) data, e.g. one model from method_dict
y_pred = method_dict['XGBoost'].predict(X_test.values)

# Precision
print('Precision:', precision_score(y_test, y_pred))
# Recall
print('Recall:', recall_score(y_test, y_pred))
# AUC (area under the ROC curve; assumes binary 0/1 labels,
# and is best computed on probability scores)
y_score = method_dict['XGBoost'].predict_proba(X_test.values)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
print('AUC:', auc(fpr, tpr))
# Other metrics: classification_report gives a per-class summary
print(classification_report(y_test, y_pred))

Regression

sklearn also provides many regression methods, such as Support Vector Regression (SVR).
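
A minimal SVR sketch on toy data (the toy data and parameters are illustrative assumptions):

import numpy as np
from sklearn.svm import SVR

# Toy 1-D regression data for illustration
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = np.sin(X).ravel()

# Fit a support vector regressor (default RBF kernel) and predict
svr = SVR(C=1.0, epsilon=0.1)
svr.fit(X, y)
y_pred = svr.predict(X)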