These are study notes from my journey learning Pandas.
I am using Emacs with Org Mode and will try to use Jupyter (emacs-jupyter) as much as I can.
I use Python 3.9 while writing this and the python environment is managed with pyenv.
- pyenv
- pyenv-virtualenv
- pyenv-pyright
- pyright
# Create a virtualenv named learn-pandas; the subcommand provided by the
# pyenv-virtualenv plugin is `virtualenv`, not `virtual`.
# NOTE(review): py3 must name a Python version known to pyenv — confirm.
pyenv virtualenv py3 learn-pandas
# Select that environment for the current directory.
pyenv local learn-pandas
# NOTE(review): presumably writes the pyright configuration for the active
# environment (pyenv-pyright plugin) — confirm.
pyenv pyright
# Install the libraries used in these notes.
pip install pandas
pip install jupyter
pip install matplotlib
(jupyter-available-kernelspecs)
import pandas as pd

print(pd.__version__)

# Column name -> column values; both lists have the same length.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

# Each dict key becomes one column of the table.
myvar = pd.DataFrame(data=mydataset)
print(myvar)
import pandas as pd
Create Series from list without index.
# Without an explicit index the values are labelled 0, 1, 2.
a = [1, 7, 2]
my_series = pd.Series(data=a)
print(my_series)
Create series with index.
# Give every value an explicit label instead of the default 0, 1, 2.
my_series_with_index = pd.Series(data=a, index=['x', 'y', 'z'])
print(my_series_with_index)
Access value with index.
# Look the value up by its label rather than by its position.
print(my_series_with_index['y'])
Create series from dict.
# The dict keys become the index labels of the Series.
calories = dict(day1=420, day2=380, day3=390)
my_series_from_dict = pd.Series(data=calories)
print(my_series_from_dict)
import pandas as pd
A DataFrame is a two-dimensional table with rows and columns.
# Two equally long columns; the rows get the default labels 0, 1, 2.
data = dict(calories=[420, 380, 390], duration=[50, 40, 45])
myvar = pd.DataFrame(data=data)
print(myvar)
Locate Rows
# A single label returns that row as a Series.
print(myvar.loc[0])
# A list of labels returns the matching rows as a DataFrame.
print(myvar.loc[[0, 1]])
Load csv file into DataFrame
# The path is relative to the current working directory.
dt = pd.read_csv("data/data.csv")
print(dt)
Load json file to DataFrame.
# The path is relative to the current working directory.
df = pd.read_json("data/data.json")
print(df)
Viewing first 10 rows.
# head() returns 5 rows by default; the argument asks for 10 instead.
print(df.head(10))
import pandas as pd
# Load the data set that the cleaning steps in this section operate on.
df = pd.read_csv("data/dirtydata.csv")
print(df)
Remove rows with empty cells
# dropna() returns a new DataFrame; df itself is left unchanged.
new_df = df.dropna()
print(new_df)
Remove rows with empty cells from the original DataFrame by passing inplace=True.
# With inplace=True the rows are removed from df itself and the call returns None.
df.dropna(inplace=True)
print(df)
Fix the wrongly formatted date shown in row 26.
# .loc looks rows up by label, so 26 is the index label, not the position.
print(df.loc[26])
# Parse the Date column into datetime values.
# NOTE(review): newer pandas versions may reject a column that mixes date
# formats unless a format is given — confirm with the installed version.
df['Date'] = pd.to_datetime(df['Date'])
print(df)
Remove rows with wrong data.
# Drop every row whose Duration is above 120 with one vectorised call
# instead of testing and dropping the rows one at a time.
df.drop(df[df["Duration"] > 120].index, inplace=True)
print(df)
Discovering duplicated data.
# One boolean per row: True marks a row that repeats an earlier row.
print(df.duplicated())
Remove duplicated data.
# Remove repeated rows from df itself, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)
print(df)
import pandas as pd
# Rebuild the cleaned DataFrame from the file so this section runs on its own.
df = pd.read_csv("data/dirtydata.csv")
# Remove rows with empty cells from df itself.
df.dropna(inplace=True)
# Parse the Date column into datetime values.
df['Date'] = pd.to_datetime(df['Date'])
# Drop every row whose Duration is above 120 with one vectorised call
# instead of testing and dropping the rows one at a time.
df.drop(df[df["Duration"] > 120].index, inplace=True)
print(df)
Finding relationships
# Correlate the numeric columns only: pandas 2.0+ no longer drops
# non-numeric columns (such as a datetime Date column) automatically,
# and selecting by dtype works on older versions as well.
print(df.select_dtypes(include="number").corr())
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("data/data.csv")
# A bare expression: shown as the cell result in a notebook, no effect in a plain script.
df
# With no arguments, plot() draws a line chart of the columns against the index.
df.plot()
plt.show()
Scatter Plot
# A scatter plot takes the column names for the x and y axes.
df.plot(kind="scatter", x="Duration", y="Calories")
plt.show()
df.plot(kind="scatter", x="Duration", y="Maxpulse")
plt.show()
# Histogram of a single column.
# NOTE(review): no plt.show() follows in the visible part of the notes — confirm the figure is displayed.
df["Duration"].plot(kind="hist")