data = pd.read_csv('my_file.csv')
data = pd.read_csv('my_file.csv', sep=';', encoding='latin-1', nrows=1000, skiprows=[2,5])
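A minimal sketch of those options together, on a throwaway semicolon-separated file (file name and contents invented for illustration):

import pandas as pd

# write a small sample file, then read it back
with open('sample.csv', 'w', encoding='latin-1') as f:
    f.write('a;b\n1;x\n2;y\n3;z\n4;w\n')

sample = pd.read_csv('sample.csv', sep=';', encoding='latin-1', nrows=2, skiprows=[2])
# skiprows=[2] drops the file's third physical line ('2;y'); nrows=2 keeps the first 2 remaining data rows
print(sample)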
data.shape
data.describe()
Mean of a Series s: print(s.sum()/s.count())
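On a made-up Series, sum()/count() matches mean(), since both skip NaN:

import pandas as pd
import numpy as np

s = pd.Series([1, 2, np.nan, 4])
print(s.sum() / s.count())  # 2.333..., NaN is ignored by both sum and count
print(s.mean())             # same value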
First 5 rows: data.head(5)
Row with label 8: data.loc[8]
Value of column_1 at label 8: data.loc[8, 'column_1']
data.loc[range(4,6)]
Positions are zero-based.
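A small illustration with invented data: .iloc is position-based (zero-based), .loc goes by index labels:

import pandas as pd

data = pd.DataFrame({'column_1': ['french', 'english', 'german']}, index=[8, 9, 10])
print(data.iloc[0, 0])          # 'french' -> first position
print(data.loc[8, 'column_1'])  # 'french' -> index label 8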
position = 1
column_name = 'gender'
column_data = pd.Series(['female','male','male'])
df.insert(position, column_name, column_data)
states = pd.Series(['dc','ca','ny'])
df['state'] = states
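Putting the two together on a toy DataFrame (names and values are just examples):

import pandas as pd

df = pd.DataFrame({'name': ['alice', 'bob', 'carol'], 'age': [25, 32, 41]})
df.insert(1, 'gender', pd.Series(['female', 'male', 'female']))  # inserted as the 2nd column
df['state'] = pd.Series(['dc', 'ca', 'ny'])                      # appended as the last column
print(df)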
df.drop(columns=['age','name'],inplace=True)
Specify axis=0 for rows and axis=1 for columns.
DF.drop('d', axis=0)
DF.drop('column', axis=1)
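For example, assuming an index that contains the label 'd' and a column literally named 'column':

import pandas as pd

DF = pd.DataFrame({'column': [1, 2], 'other': [3, 4]}, index=['c', 'd'])
print(DF.drop('d', axis=0))        # drops the row labelled 'd'
print(DF.drop('column', axis=1))   # drops the column named 'column'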
& (AND)
~ (NOT)
| (OR)
data[data['column_1']=='french']
data[(data['column_1']=='french') & (data['year_born']==1990)]
data[(data['column_1']=='french') & (data['year_born']==1990) & ~(data['city']=='London')]
data[data['column_1'].isin(['french', 'english'])]
d001 = dfbolhas.drop_duplicates(subset='frame', keep="first")
Single value (row 0, column 1) -> df.iloc[0, 1]
Row range -> df.iloc[0:3]
dfgotas = dfgotas.rename(columns={"diamEquivFA[mm]": "diamEquivFA[GOTAS]"})
df2 = df1.copy()
level_map = {1: 'high', 2: 'medium', 3: 'low'}
df['c_level'] = df['c'].map(level_map)
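A self-contained version of the mapping, with made-up values in 'c':

import pandas as pd

df = pd.DataFrame({'c': [1, 3, 2, 1]})
level_map = {1: 'high', 2: 'medium', 3: 'low'}
df['c_level'] = df['c'].map(level_map)
print(df)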
df.select_dtypes(include=['float64', 'int64'])
def rule(x, y):
    if x == 'high' and y > 10:
        return 1
    else:
        return 0

df = pd.DataFrame({'c1': ['high', 'high', 'low', 'low'], 'c2': [0, 23, 17, 4]})
df['new'] = df.apply(lambda x: rule(x['c1'], x['c2']), axis = 1)
df.head()
df['maximum'] = df[['c1', 'c2']].max(axis=1)  # row-wise max; assumes both columns are comparable (e.g. numeric)
df['c'].value_counts().sort_index()
df['c'].value_counts()
import pandas as pd
import numpy as np
df = pd.DataFrame({ 'id': [1,2,3], 'c1':[0,0,np.nan], 'c2': [np.nan,1,1]})
df = df[['id', 'c1', 'c2']]
df['num_nulls'] = df[['c1', 'c2']].isnull().sum(axis=1)
df.head()
df_filter = df['ID'].isin(['A001','C022',...])
df[df_filter]
import numpy as np
cut_points = [np.percentile(df['c'], i) for i in [50, 80, 95]]
df['group'] = 1
for i in range(3):
    df['group'] = df['group'] + (df['c'] < cut_points[i])
    # or <= cut_points[i]
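A quick check of what this produces on invented data (column 'c' is assumed numeric):

import pandas as pd
import numpy as np

df = pd.DataFrame({'c': range(1, 11)})
cut_points = [np.percentile(df['c'], i) for i in [50, 80, 95]]
df['group'] = 1
for i in range(3):
    df['group'] = df['group'] + (df['c'] < cut_points[i])
print(df)  # group 4 = below the 50th percentile, ..., group 1 = above the 95th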
data.to_csv('my_new_file.csv', index=False)
print(df[:5].to_csv())
Line plot -> data['column_numerical'].plot()
Histogram -> data['column_numerical'].hist()
data.loc[8, 'column_1'] = 'english'
data.loc[data['column_1']=='french', 'column_1'] = 'French'
data['column_1'].value_counts()
data['column_1'].map(len)
data['column_1'].map(len).map(lambda x: x/100).plot()
data.corr()
data.corr().applymap(lambda x: int(x*100)/100)
pd.plotting.scatter_matrix(data, figsize=(12,8))
data.merge(other_data, on=['column_1', 'column_2', 'column_3'])
data.groupby('column_1')['column_2'].apply(sum).reset_index()
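A minimal, self-contained illustration of merge and groupby (column names invented; .sum() is used here instead of .apply(sum), which gives the same result):

import pandas as pd

left = pd.DataFrame({'key': ['a', 'b'], 'v1': [1, 2]})
right = pd.DataFrame({'key': ['a', 'b'], 'v2': [10, 20]})
print(left.merge(right, on='key'))  # inner join on the shared column

sales = pd.DataFrame({'column_1': ['x', 'x', 'y'], 'column_2': [1, 2, 3]})
print(sales.groupby('column_1')['column_2'].sum().reset_index())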
df = pd.DataFrame({'name': ['alice', 'bob', 'charlie'], 'date_of_birth': ['27/05/2001', '16/02/1999', '25/09/1998']})
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%d/%m/%Y')
def segmentMatch(TimeCol, ResponseCol):
    result = TimeCol / ResponseCol
    return result
df['NewCol'] = df.apply(lambda x: segmentMatch(x['TimeCol'], x['ResponseCol']), axis=1)
df.to_csv('random_data.gz', compression='gzip', index=False)
df = pd.read_csv('random_data.gz')
df.fillna(0)
df.fillna(method='bfill')
df.fillna(method='ffill')
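Behaviour of the three fill strategies on a small invented example:

import pandas as pd
import numpy as np

df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
print(df.fillna(0))               # replace NaN with a constant
print(df.fillna(method='bfill'))  # backward fill: take the next valid value
print(df.fillna(method='ffill'))  # forward fill: carry the previous valid value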
https://towardsdatascience.com/10-python-pandas-tricks-that-make-your-work-more-efficient-2e8e483808ba
https://towardsdatascience.com/be-a-more-efficient-data-scientist-today-master-pandas-with-this-guide-ea362d27386
http://queirozf.com/entries/pandas-dataframe-examples-column-operations