import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Example CSV path from the Kaggle wine-reviews dataset.
file_name = 'input/wine-reviews/winemag-data-130k-v2.csv'
# Basic load; NOTE 'something.csv' is a placeholder path.
stock_data = pd.read_csv('something.csv')
# Result discarded here -- shown only to illustrate the call.
pd.read_csv(file_name)
# index_col=0 uses the first CSV column as the row index.
stock_data = pd.read_csv('something.csv', index_col=0)
stock_data.shape  # (rows, columns) tuple
stock_data.describe()  # summary statistics for numeric columns
stock_data.head()  # first rows (5 by default)
stock_data.dtypes #OR, for a single column:
stock_data['ticker'].dtype
# NOTE(review): 'ticker' vs 'Ticker' casing is inconsistent across these
# snippets -- confirm the actual column name.
stock_data.Ticker.unique()  # distinct values in the column
stock_data.sell_date.value_counts() # how many stocks were sold each day
stock_data.groupby('sell_date').sell_date.count()  # equivalent to value_counts() above
reviews.groupby('region').region.count()  # rows per region
reviews.groupby('region').price.min()  # minimum price per region
stock_data.price  # attribute-style column access
stock_data['price']  # bracket-style column access (works for any column name)
stock_data['price'][25]  # single value: column, then index 25
stock_data['price']['TSLA']  # single value: column, then index label
stock_data.iloc[25] # row of Tesla stock (positional)
stock_data.loc[:, ['price', 'sell_date']]  # all rows, selected columns
stock_data.set_index('Ticker')  # returns a new DataFrame; not in place
stock_data.q3_growth > 0 # boolean Series: stocks that have appreciated in the 3rd quarter
stock_data.loc[stock_data.q3_growth > 0] # filters a dataframe of these stocks
stock_data.loc[(stock_data.q3_growth > 0) & (stock_data.q2_growth > 0)] # stocks that appreciated for 2 quarters; each condition needs its own parentheses
reviews.loc[(reviews.country == 'Italy') | (reviews.country == 'France') ]  # OR of two conditions
reviews.loc[reviews.country.isin(['Italy', 'France'])]  # same filter via isin
reviews.loc[reviews.price.notnull()]  # keep rows that have a price
wine_reviews.loc[wine_reviews.price.isnull()]  # rows missing a price
# Element-wise string concatenation builds a derived column.
wine_reviews['top_rated_regions'] = wine_reviews['country'] + ' - ' + wine_reviews['region_1']
wine_reviews.points.astype('float64') # convert from int to float (returns a new Series)
wine_reviews.rename(columns={'points': 'score'})  # returns a renamed copy; original unchanged
pd.concat([us_wine_reviews, france_wine_reviews])  # stack two DataFrames vertically
# Quantify how much of the DataFrame is missing, as a percentage of all cells.
# np.product was deprecated and removed in NumPy 2.0; np.prod is the supported name.
total_cells = np.prod(wine_reviews.shape)
missing_values_count = wine_reviews.isnull().sum()  # per-column missing counts
total_missing = missing_values_count.sum()
percent_missing = (total_missing/total_cells) * 100
print(f'Percent missing: {percent_missing}')
# Drop rows missing 'variety' (modifies wine_reviews in place).
wine_reviews.dropna(subset=['variety'], inplace=True)
# NOTE: drop() returns a new DataFrame; assign the result (or pass inplace=True)
# if 'region_2' should actually be removed.
wine_reviews.drop(['region_2'], axis=1)
# If a date column is present, check whether pandas recognized it as a datetime dtype.
# Dates read from CSV arrive as object (string) dtype by default -- check first.
stock_data['sale_date'].dtype
# Parse to datetime64; an explicit format is faster and catches malformed rows.
stock_data['sale_date_2'] = pd.to_datetime(stock_data['sale_date'], format='%m/%d/%y')
stock_data['sale_date_2'].head()
# The .dt accessor exposes datetime components once the column is datetime64.
stock_data_sell_dates = stock_data['sale_date_2'].dt.day
# distplot was deprecated and removed from seaborn; histplot is the replacement
# (no KDE by default, matching kde=False). Requires `import seaborn as sns`,
# which is not visible in this file -- confirm it exists.
sns.histplot(stock_data_sell_dates, bins=31)
# If you have categorical data, you'll probably need to one-hot encode it
# (produces multiple columns) or label-encode it (a single integer column).
# Rows without a country can't be encoded or combined; drop them in place.
wine_reviews.dropna(subset=['country'], inplace=True)
# Build the string combination BEFORE label-encoding: once 'country' holds
# integer codes, string concatenation with ' - ' would raise a TypeError.
# (The original did this after encoding, which breaks.)
wine_reviews['country-region'] = wine_reviews['country'] + ' - ' + wine_reviews['region_1']
cat_features = ['country']
encoder = LabelEncoder() #from scikit-learn
# fit_transform maps each distinct country string to an integer code.
encoded = wine_reviews[cat_features].apply(encoder.fit_transform)
wine_reviews[cat_features] = encoded
# When selecting and narrowing features for a model, there are two general
# approaches: univariate methods, which score one feature at a time, or
# selecting the best features all at once via L1 (Lasso) or L2 (Ridge)
# regularization.
# - L1 (Lasso): linear model penalizing the absolute value of the coefficients
#   (can zero coefficients out, effectively selecting features).
# - L2 (Ridge): penalizes the square of the coefficients.
# BigQuery SQL: accident count per day of week, busiest day first
# (NHTSA traffic-fatalities public dataset).
query_1 = """
SELECT COUNT(consecutive_number) AS num_accidents,
EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS day_of_week
FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
GROUP BY day_of_week
ORDER BY num_accidents DESC
"""
# BigQuery SQL with a CTE: Bitcoin transaction count per calendar day,
# in chronological order.
query_2 = """
WITH time AS
(
SELECT DATE(block_timestamp) AS trans_date
FROM `bigquery-public-data.crypto_bitcoin.transactions`
)
SELECT COUNT(1) AS transactions,
trans_date
FROM time
GROUP BY trans_date
ORDER BY trans_date
"""
from sklearn.tree import DecisionTreeRegressor
# Hold out a validation split, fit a tree, then score with mean absolute error.
# (train_test_split / mean_absolute_error come from sklearn.model_selection /
# sklearn.metrics -- those imports are not visible in this file; confirm.)
melbourne_model = DecisionTreeRegressor(random_state=1)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
melbourne_model.fit(train_X, train_y)
# Fixed: original line was `melbourne_model.predict(X_val()` -- unbalanced
# parenthesis, wrong variable name (the split produced val_X), and the
# predictions were never stored even though the next line uses them.
val_predictions = melbourne_model.predict(val_X)
mean_absolute_error(val_y, val_predictions)
# - Decision Trees: parameters to tune include node size and tree depth.
# - Random forest: builds many trees and averages their predictions
#   (an ensemble over a distribution of trees, of sorts).
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
# Univariate selection: keep the k features with the strongest linear
# relationship (F-statistic) to the target.
selector = SelectKBest(score_func=f_regression, k=15)
# Fit on the training split only, so test data never influences selection.
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
# Which features were selected? get_support() returns a boolean mask
# aligned with the original column order.
selected_mask = selector.get_support()
all_names = X_train.columns
selected_names = all_names[selected_mask]
unselected_names = all_names[~selected_mask]
print(selected_names)
# OR print them one per line (fixed: the loop body was unindented in the
# original, which is an IndentationError):
print('Features selected:')
for name in selected_names:
    print(name)
# Sweep k from 1 to the full feature count, retraining and scoring a linear
# model at each subset size. (Fixed: the loop body was completely unindented
# in the original -- an IndentationError. LinearRegression and
# mean_absolute_error must be imported elsewhere; confirm.)
for k in range(1, len(X_train.columns)+1):
    print(f'{k} features')
    selector = SelectKBest(score_func=f_regression, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    model = LinearRegression()
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Test Mean Absolute Error: ${mae:,.0f} \n')