Python implementation based on sklearn.
When handling categorical data, it is very common to utilize the LabelEncoder provided by scikit-learn (sklearn).
However, LabelEncoder does not support multi-column pandas DataFrames. Here I provide a simple multi-column label encoder for pandas DataFrames, built on top of LabelEncoder.
- sklearn:
pip install --upgrade scikit-learn
Usage of MultiLabelEncoder is the same as LabelEncoder, except that fit, transform, fit_transform and inverse_transform each take a pandas.DataFrame.
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder
# a multi-column label encoder for pandas dataframe
class MultiLabelEncoder:
    """Label-encode every column of a pandas DataFrame independently.

    One ``LabelEncoder`` is kept per column, keyed by column name in
    ``self.d``, so each column gets its own label <-> integer mapping.
    """

    def __init__(self):
        super().__init__()
        # Column name -> encoder. A new encoder is created the first time a
        # column is fitted.
        self.d = defaultdict(LabelEncoder)

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit one encoder per column of *df* and return the encoded frame.

        Args:
            df: Frame whose columns hold the labels to encode.

        Returns:
            A frame with the same index and columns as *df*, holding the
            encoded values.
        """
        return self._apply(df, "fit_transform", fitted_only=False)

    def fit(self, df: pd.DataFrame) -> "MultiLabelEncoder":
        """Fit one encoder per column of *df*.

        Args:
            df: Frame whose columns hold the labels to learn.

        Returns:
            This encoder, so calls can be chained
            (``enc.fit(df).transform(df)``), as with scikit-learn estimators.
        """
        for name, column in df.items():
            self.d[name].fit(column)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode each column of *df* with its previously fitted encoder.

        Args:
            df: Frame whose columns were all fitted beforehand.

        Returns:
            A frame with the same index and columns as *df*, holding the
            encoded values.

        Raises:
            sklearn.exceptions.NotFittedError: If a column of *df* has no
                fitted encoder.
        """
        return self._apply(df, "transform", fitted_only=True)

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map encoded values in each column of *df* back to their labels.

        Args:
            df: Frame of encoded values whose columns were all fitted
                beforehand.

        Returns:
            A frame with the same index and columns as *df*, holding the
            original labels.

        Raises:
            sklearn.exceptions.NotFittedError: If a column of *df* has no
                fitted encoder.
        """
        return self._apply(df, "inverse_transform", fitted_only=True)

    def _encoder_for(self, name, *, fitted_only):
        """Return the encoder stored for column *name*.

        With ``fitted_only=True`` an unknown column raises instead of
        silently adding a fresh, unfitted encoder to ``self.d``.
        """
        if fitted_only and name not in self.d:
            # Imported lazily: only needed on this error path.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"No encoder has been fitted for column {name!r}; "
                "call fit or fit_transform on it first."
            )
        return self.d[name]

    def _apply(self, df, method_name, *, fitted_only):
        """Run encoder method *method_name* on every column of *df*."""

        def encode_column(column):
            encoder = self._encoder_for(column.name, fitted_only=fitted_only)
            values = getattr(encoder, method_name)(column)
            # Re-attach the column's index: when the applied function returns
            # a bare array, DataFrame.apply drops the frame's row labels.
            return pd.Series(values, index=column.index, name=column.name)

        return df.apply(encode_column)