Interesting code snippets
1. Transformer to select the most important attributes by linear correlation with a target attribute
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
class ImportantAttributeSelector(BaseEstimator, TransformerMixin):
    """Rank attributes by the strength of their linear correlation with a target.

    The ranking is built from ``X.corr()``: the column belonging to
    ``target_attribute`` is taken, every coefficient is converted to its
    absolute value, and attribute names are returned strongest-first.

    Note that the target attribute is itself part of the ranking, because the
    correlation column contains the target's correlation with itself.
    """

    def __init__(self, target_attribute):
        # Name of the column against which every other attribute is ranked.
        self.target_attribute = target_attribute

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the ranking is computed in ``transform``."""
        return self

    def transform(self, X, attr_count=None):
        """Return attribute names ordered by absolute correlation with the target.

        Args:
            X: Object exposing ``.corr()`` (presumably a pandas DataFrame) that
                contains ``target_attribute`` among its correlated columns.
            attr_count: Maximum number of attribute names to return. ``None``
                (the default) returns the full ranking, which lets the
                single-argument ``transform(X)`` call made by
                ``fit_transform`` and pipelines succeed.

        Returns:
            A list of attribute names, strongest absolute correlation first.
            Ties are broken by attribute name in descending order. Attributes
            whose correlation is undefined (NaN) are placed last.

        Raises:
            KeyError: If ``target_attribute`` is not a column of the
                correlation matrix.
        """
        # Coefficients lie in [-1, 1]; values near 0 mean little or no linear
        # relationship, so only the magnitude matters for the ranking.
        strengths = X.corr()[self.target_attribute].abs()
        # NaN never compares consistently inside a sort, so map it to a value
        # below every real magnitude; those attributes then sink to the end.
        strengths = strengths.fillna(-1.0)
        # Sort (strength, name) pairs in descending order.
        ranked = sorted(
            ((strength, name) for name, strength in strengths.items()),
            reverse=True,
        )
        attr_names = [name for _, name in ranked]
        return attr_names[:attr_count]
2. Select only numeric columns from the dataset (pandas DataFrame)
# Keep only the numeric columns. include=["number"] matches every numeric
# dtype, whereas exclude=["object"] would still let datetime, boolean, and
# categorical columns through.
numeric_data = data.select_dtypes(include=["number"]).copy()