mpolinowski / python-asserts-cheatsheet

Python Asserts in Data Science Cheat Sheet

Home Page:https://mpolinowski.github.io/docs/Development/Python/2023-05-18-python-asserts/2023-05-18

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Python Asserts Cheat Sheet

Datasets

Validating and Verifying Data

Yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC) by technology providers authorized under the Taxicab & Livery Passenger Enhancement Programs (TPEP/LPEP). The trip data was not created by the TLC, and TLC makes no representations as to the accuracy of these data.

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -P dataset
import csv
import math
import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.testing as pdt
yellow_tripdata_df = pd.read_parquet(
    'dataset/yellow_tripdata_2023-01.parquet'
)

yellow_tripdata_df.to_csv('dataset/yellow_tripdata_2023-01.csv')
yellow_tripdata_df = pd.read_csv(
    'dataset/yellow_tripdata_2023-01.csv',
    parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],
    nrows=1000
)
yellow_tripdata_df.head(5)
# https://www.kaggle.com/datasets/neomatrix369/nyc-taxi-trip-duration-extended
trip_ext_df = pd.read_csv('dataset/nyc_trip_duration_extended.csv')
trip_ext_df.head(5)

Introduction to Asserts

Asserts in Python

# simple assert
x = 'five'
assert x == 5
# AssertionError
list = [6,2,3,4,5]
assert all(list[i] <= list[i+1] for i in range(len(list)-1))
# AssertionError
def add(a,b):
    return a + b

assert add(2,3) < 5
AssertionError
trip_ext_df.columns

# Index(['name', 'district', 'neighbourhood', 'latitude', 'longitude',
#        'geonumber'],
#       dtype='object')
with open('dataset/nyc_trip_duration_extended.csv') as f:
    reader = csv.DictReader(f)
    
    expected_columns = ['name', 'district', 'neighbourhood', 'latitude', 'longitude',
       'geonumber', 'missing_column']
    
    assert reader.fieldnames == expected_columns, f"Expected columns: {expected_columns}, but got {reader.fieldnames}"
    
# AssertionError:
# Expected columns: ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber', 'missing_column'],
# but got ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber']
with open('dataset/yellow_tripdata_2023-01.csv') as f:
    reader = csv.DictReader(f)
    
    for row in reader:
        # check passenger count is positive int
        assert float(row['passenger_count']) > 0., f"ERROR :: Invalid Passenger Count: {row['passenger_count']}"
        
# AssertionError: ERROR :: Invalid Passenger Count: 0.0
# how many trips were without passengers
trips_without_passengers = 0

with open('dataset/yellow_tripdata_2023-01.csv') as f:
    next(f) # # skip header
    
    for line in f:
        values = line.strip().split(',')
        trip_id = values[0]
        passenger_count = values[4]
        if passenger_count == '0.0':
            trips_without_passengers += 1
            
print(f'Trips without passengers: {trips_without_passengers}')
# Trips without passengers: 51164
perct_zero_trips = len(zero_trips) * 100 / len(yellow_tripdata_df)
print("%.2f" % perct_zero_trips + ' %')
# 1.67 %

Asserts in Pandas

Indices

index1 = pd.Index([1,2,3])
index2 = pd.Index([1,2,'three'])

pdt.assert_index_equal(index1, index2)

# Index classes are different
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Index([1, 2, 'three'], dtype='object')
index1 = pd.Index([1,2,3])
index2 = pd.Index([3,2,1])

pdt.assert_index_equal(index1, index2, check_order=True)
# Index values are different (66.66667 %)
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Int64Index([3, 2, 1], dtype='int64')
index1 = pd.Index([1.0,2.0,3.0])
index2 = pd.Index([1.0,2.0,3.1])

pdt.assert_index_equal(index1, index2, check_exact=False, atol=0.1)

# Index values are different (33.33333 %)
# [left]:  Float64Index([1.0, 2.0, 3.0], dtype='float64')
# [right]: Float64Index([1.0, 2.0, 3.1], dtype='float64')
index1 = pd.Index(yellow_tripdata_df['tpep_pickup_datetime'].dt.date)
index1[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_pickup_datetime')
index2 = pd.Index(yellow_tripdata_df['tpep_dropoff_datetime'].dt.date)
index2[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_dropoff_datetime')
pdt.assert_index_equal(index1, index2, check_exact=True, check_names=False)

# Index values are different (0.3 %)
# [left]:  Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_pickup_datetime', length=1000)
# [right]: Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_dropoff_datetime', length=1000)
# show difference
index_diff = yellow_tripdata_df[
    index1 != index2
][
    ['tpep_pickup_datetime','tpep_dropoff_datetime']
]

index_diff.head()

#     tpep_pickup_datetime tpep_dropoff_datetime
# 383  2023-01-01 00:36:07   2023-01-02 00:17:13
# 567  2022-12-31 23:59:37   2023-01-01 00:07:28
# 761  2022-12-31 23:58:27   2023-01-01 00:02:21

Series

s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series([1,2,3], name='series2')

pdt.assert_series_equal(s1,s2,check_names=True)

# Attribute "name" are different
# [left]:  series1
# [right]: series2
s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series(['1','2','3'], name='series2')

pdt.assert_series_equal(s1,s2,check_names=False, check_dtype=False)

# Series values are different (100.0 %)
# [index]: [0, 1, 2]
# [left]:  [1, 2, 3]
# [right]: [1, 2, 3]
pickup_series = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
pickup_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_pickup_datetime, dtype: object
dropoff_series = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date
pickup_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_pickup_datetime, dtype: object
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

# AssertionError: Series are different
# Series values are different (0.3 %)
# drop values that don't fit
index_diff = pickup_series.index[pickup_series != dropoff_series]
yellow_tripdata_df_drop = yellow_tripdata_df.drop(index_diff).reset_index(drop=True)
# rebuild series
pickup_series = yellow_tripdata_df_drop['tpep_pickup_datetime'].dt.date
dropoff_series = yellow_tripdata_df_drop['tpep_dropoff_datetime'].dt.date
# re-check - this times it works
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

DataFrames

df1 = pd.DataFrame({'A': [1,2,3], 'B': [3,2,1]})
df2 = pd.DataFrame({'B': [3,2,1], 'A': [1,2,3]})

pdt.assert_frame_equal(df1,df2,check_like=False)

# DataFrame.columns values are different (100.0 %)
# [left]:  Index(['A', 'B'], dtype='object')
# [right]: Index(['B', 'A'], dtype='object')
pickup_df = yellow_tripdata_df.copy()
dropoff_df = yellow_tripdata_df.copy()

pickup_df['date'] = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
dropoff_df['date'] = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date

pdt.assert_frame_equal(
    pickup_df[['date']],
    dropoff_df[['date']],
    check_exact=True,
    check_names=False
)

# AssertionError: DataFrame.iloc[:, 0] (column name="date") are different
# DataFrame.iloc[:, 0] (column name="date") values are different (0.3 %)
# get index of mismatched rows
index_diff = pickup_df.index[
    pickup_df['date'].ne(dropoff_df['date'])
]

index_diff
# Int64Index([383, 567, 761], dtype='int64')
# drop rows at those indices
pickup_df_drop = pickup_df.drop(index_diff)
dropoff_df_drop = dropoff_df.drop(index_diff)

# verify that assert now works
pdt.assert_frame_equal(
    pickup_df_drop[['date']],
    dropoff_df_drop[['date']],
    check_exact=True,
    check_names=False
)

Asserts in Numpy

a = np.array([1,2,3])
b = np.array([3,2,1])

npt.assert_array_equal(a,b)

# AssertionError: Arrays are not equal
# Mismatched elements: 2 / 3 (66.7%)
# Max absolute difference: 2
# Max relative difference: 2.
#  x: array([1, 2, 3])
#  y: array([3, 2, 1])
string1 = 'string'
string2 = 'STRING'

npt.assert_string_equal(string1,string2)
a = np.array([1,2,3])
b = np.array([0.98,2.02,2.98])

npt.assert_allclose(a,b,atol=0.01)

# AssertionError: Not equal to tolerance rtol=1e-07, atol=0.01
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 0.02
# Max relative difference: 0.02040816
# x: array([1, 2, 3])
# y: array([0.98, 2.02, 2.98])
a = np.array([1,2,3])
b = np.array([2,3,4])

npt.assert_array_less(a,b)
npt.assert_array_less(b,a)

# AssertionError: Arrays are not less-ordered
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 1
# Max relative difference: 1.
#  x: array([2, 3, 4])
#  y: array([1, 2, 3])

Assert-based Testing

Quantitative Tests

def test_for_missing_data(df):
    # count all missing values and assert number to be zero
    assert df.isnull().sum().sum() == 0, 'ERROR :: DataFrame contains missing data!'
    return True
assert test_for_missing_data(trip_ext_df)
# True
def test_non_numerical_data_types(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype =='float64', f'ERROR :: {col} has a non-numerical dType'
    return True
test_columns = ['neighbourhood','latitude','longitude','geonumber']
assert test_non_numerical_data_types(trip_ext_df, trip_ext_df[test_columns])
# AssertionError: ERROR :: neighbourhood has a non-numerical dType
def test_for_out_of_range(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype == 'float64', f'ERROR :: {col} has a non-numerical dType'
        assert df[col].max() <= math.inf, f'ERROR :: {col} contains infinite values'
        assert df[col].min() >= -math.inf, f'ERROR :: {col} contains infinite values'
        assert not np.isnan(df[col]).any(), f'ERROR :: {col} contains NaN values'
        assert not np.isinf(df[col]).any(), f'ERROR :: {col} contains infinite values'
    return True
test_columns = ['latitude','longitude','geonumber']
assert test_non_numerical_data_types(trip_ext_df, trip_ext_df[test_columns])
# True

Logical Tests

def test_for_logical_errors(df):
    # all dropoffs AFTER pickups
    assert all(df['tpep_dropoff_datetime'] > df['tpep_pickup_datetime']), 'ERROR :: Drop-off time before pickup'
    # no negative trip distances
    assert (df['trip_distance'] >= 0).all(), 'ERROR :: Negative trip distances'
    # no negative passenger count
    assert (df['passenger_count'] >= 0).all(), 'ERROR :: Negative passenger count'
    
    return True
assert test_for_logical_errors(yellow_tripdata_df)
# True

About

Python Asserts in Data Science Cheat Sheet

https://mpolinowski.github.io/docs/Development/Python/2023-05-18-python-asserts/2023-05-18


Languages

Language:Jupyter Notebook 100.0%