SirWilliam254 / DataCleaning

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

DataCleaning

import pandas as pd
# Define a function to clean the data
def clean_data(df, *, fill_method=None, missing_threshold=0.05, z_threshold=3.0):
    """Return a cleaned version of *df*; the input frame is left unmodified.

    Cleaning steps, in order:

    1. Missing values: if the overall fraction of missing cells is below
       ``missing_threshold`` the affected rows are dropped; otherwise the
       gaps are filled column-wise with the mean, median or mode.
    2. Exact duplicate rows are dropped.
    3. Rows holding an outlier are dropped, i.e. rows where any numeric
       column lies ``z_threshold`` or more standard deviations from that
       column's mean.

    Args:
        df: The ``pandas.DataFrame`` to clean.
        fill_method: ``'mean'``, ``'median'`` or ``'mode'``. When ``None``
            (the default) and filling is required, the method is requested
            through ``st.radio``. NOTE(review): ``st`` is not defined in
            this block -- presumably ``streamlit`` imported elsewhere;
            confirm before relying on the interactive path.
        missing_threshold: Fraction of missing cells below which rows with
            gaps are dropped instead of filled.
        z_threshold: Number of standard deviations used as the outlier
            cut-off.

    Returns:
        A new ``pandas.DataFrame`` with the original column layout and the
        surviving rows (original index labels are kept).

    Raises:
        ValueError: If ``fill_method`` is given but is not one of the
            supported names.
    """
    valid_methods = ('mean', 'median', 'mode')
    if fill_method is not None and fill_method not in valid_methods:
        raise ValueError(
            f"fill_method must be one of {valid_methods}, got {fill_method!r}"
        )

    # Nothing to clean; also avoids a NaN missing-ratio that would otherwise
    # push an empty frame into the interactive fill branch.
    if df.empty:
        return df.copy()

    # Mean of the per-column missing fractions == overall missing fraction,
    # because every column has the same number of rows.
    missing_ratio = df.isna().mean().mean()
    if missing_ratio < missing_threshold:
        cleaned = df.dropna()
    else:
        if fill_method is None:
            # Allow user to choose a fill method
            fill_method = st.radio("Choose a fill method:", ['mean', 'median', 'mode'])
        cleaned = _fill_missing(df, fill_method)

    cleaned = cleaned.drop_duplicates()
    return _drop_outlier_rows(cleaned, z_threshold)


def _fill_missing(df, method):
    """Fill missing cells column-wise with the mean, median or mode."""
    if method == 'mean':
        # numeric_only: text/datetime columns have no mean and would raise.
        fill_values = df.mean(numeric_only=True)
    elif method == 'median':
        fill_values = df.median(numeric_only=True)
    else:
        modes = df.mode()
        if modes.empty:
            return df.copy()
        # mode() returns a frame (one row per tied mode); passing the frame
        # itself to fillna aligns on the row index and fills almost nothing,
        # so take the first mode of every column instead.
        fill_values = modes.iloc[0]

    # Columns without a usable statistic (e.g. entirely missing) are skipped.
    fill_values = fill_values.dropna()
    if fill_values.empty:
        return df.copy()
    return df.fillna(fill_values.to_dict())


def _drop_outlier_rows(df, z_threshold):
    """Drop rows where any numeric column is too far from its column mean."""
    numeric = df.select_dtypes(include='number')
    if numeric.empty:
        return df

    deviation = (numeric - numeric.mean()).abs()
    spread = numeric.std()
    # A zero (constant column) or undefined (single row) spread carries no
    # outlier information; treat every value in such a column as inlying.
    limit = (z_threshold * spread).where(spread > 0, float('inf'))

    # Cells that are still missing must not cause their row to be dropped.
    within = deviation.lt(limit, axis='columns') | numeric.isna()
    return df[within.all(axis=1)]

About

License:MIT License