Psingh12354 / Pandas


Pandas Link

  • To get one or more rows, use loc
  • df.to_string() prints the entire dataframe
  • Use head or tail to get the first or last few rows, e.g. df.tail()
  • To get info about the data, use df.info()
  • One way to deal with empty cells is to remove the rows that contain them: df.dropna() or df.dropna(inplace = True); the second makes the change permanent.
  • To get the total number of rows: len(df)
  • The fillna() method replaces empty cells with a value: df.fillna(130, inplace = True)
  • We can convert a column to a single data type, e.g. df['Date'] = pd.to_datetime(df['Date'])
  • The duplicated() method returns a Boolean value for each row: print(df.duplicated())
  • To check whether the file is being run directly, print(__name__) (it shows '__main__' when run as a script)
  • Use _ as a placeholder for a value you intend to ignore. In the example below, _ holds the (ignored) value at index 1:
tup = ('Priyanshu',23,'PA')
name,_,desig = tup
print(name,_,desig)

Useful content

pip install markdown
import markdown
html = markdown.markdown(your_text_string)
import pandas
mydataset = {
  'cars': ["BMW", "Volvo", "Ford"],
  'passings': [3, 7, 2]
}

df = pandas.DataFrame(mydataset)
df
    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2

df.loc[0]
cars        BMW
passings      3
Name: 0, dtype: object

df.loc[[0,1]]
    cars  passings
0    BMW         3
1  Volvo         7
import numpy as np
np_array = np.array([1,2,4,2])
np_array
array([1, 2, 4, 2])
pd_series = pd.Series(np_array)
pd_series
0    1
1    2
2    4
3    2
dtype: int32
pd_series = pd.Series(np_array,index=['a','b','c','d'])
                      
pd_series
                      
a    1
b    2
c    4
d    2
dtype: int32
pd_series['a']
          
1
pd_series[['a','c']]
          
a    1
c    4
dtype: int32

Excel

df = pd.read_excel(r'C:\Sample Data\data_1.xlsx')
# use sheet number starting from 0
df = pd.read_excel(r'C:\Sample Data\data_1.xlsx',sheet_name=0)
# by name
df = pd.read_excel(r'C:\Sample Data\data_1.xlsx',sheet_name='Orders')
# can define the header row, counting from row 0
df = pd.read_excel(r'C:\Sample Data\data_1.xlsx',sheet_name='Orders',header=3)
# We can set any column as the index column, like below
df = pd.read_excel(r'C:\Sample Data\data_1.xlsx',sheet_name='Orders',header=3,index_col='Row ID')
# To get the data types
df.dtypes
# To perform basic statistical operation use below command
df.describe()
# to list down all the columns
df.columns
# by default drop() removes a row (axis=0); pass axis=1 to drop a column
# inplace is False by default; set it to True to make the change stick
df.drop('State',axis=1,inplace=True)
# To add new column with certain operation
df['Per Units Sales'] = df['Sales']/df['Quantity']
# to save the data to Excel, use the command below
df.to_excel('Path/filename')
df.to_excel(r'C:\Sample Data\data_2.xlsx')
# if you don't want the index column in the generated Excel file
df.to_excel(r'C:\Sample Data\data_3.xlsx',index=False)
# to display all the columns (pandas truncates wide frames by default)

pd.set_option('display.max_columns', None)

# to change the dtypes

df['Postal Code'] = df['Postal Code'].astype(str)

Date

# to format the order date
df['Order Date'] = pd.to_datetime(df['Order Date'],errors = 'raise',format = '%Y-%m-%d')
# to extract the year (similarly 'month', 'day', etc.)
df['year'] = df['Order Date'].dt.year
# to get the shipping duration (Ship Date minus Order Date)
df["Shipping Date"] = df["Ship Date"] - df["Order Date"]
# lowercase %b gives the abbreviated month name (Nov) and uppercase %B the full name (November)
df['String Date'] = df['Order Date'].dt.strftime('%B-%Y')
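For the abbreviated month name, swap %B for %b; a small sketch, assuming the same 'Order Date' column ('Short Date' is just an illustrative name):

# abbreviated month name, e.g. Nov-2016
df['Short Date'] = df['Order Date'].dt.strftime('%b-%Y')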

Data

isna() checks each cell for a null/NaN value and returns a Boolean for each:
df.isna()
# to get the sum of all na values
df.isna().sum()
# row wise
df.isna().sum(axis=1)
# to get all the rows which have NaN values
df[df.isna().sum(axis=1)>0]
# how='any' drops the row/col if it contains any null value
# how='all' drops it only if every value in the row/col is null
df.dropna(how='all',inplace=True)
# fillna
df.fillna(0,inplace=True)
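A common variant is filling with a per-column statistic instead of a constant; a minimal sketch, assuming a numeric 'Sales' column:

# fill missing Sales values with the column mean
df['Sales'] = df['Sales'].fillna(df['Sales'].mean())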

To split Excel rows using explode

# The pandas explode() method is used to transform each element of a list-like structure (such as Python lists, tuples, sets, or NumPy arrays) to a separate row.
df = pd.read_excel(r'C:\Sample Data\Book4.xlsx',header=1)
  
df['ColName'] = df['ColName'].str.split(', ')
  
df= df.explode('ColName')
  
df.to_excel(r'C:\Sample Data\Python1.xlsx',index=False)
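Here is a self-contained sketch of the split + explode step with made-up data, so the effect is visible without the Excel files:

import pandas as pd
demo = pd.DataFrame({'ID': [1, 2], 'ColName': ['a, b, c', 'd']})
demo['ColName'] = demo['ColName'].str.split(', ')  # each cell becomes a list
demo = demo.explode('ColName')                     # one row per list element
print(demo)
#    ID ColName
# 0   1       a
# 0   1       b
# 0   1       c
# 1   2       d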

Combining dataframes

nutrition = pd.DataFrame({"item":["pizza","pastry","burritto","salad","pasta"],
                         "avg_calorie":[3200,800,940,240,740],
                         "protein":["12%","4%","16%","6%","10%"]})
menu = pd.DataFrame({"item":["pizza","pasta","salad","burritto","taco","burger"],
                    "price":[14.99,12.99,7.99,10.99,6.99,5.99],
                    "popularity":["high","medium","low","high","medium","high"]})
# concat just sticks the dataframes together (here side by side with axis=1); we can pass multiple dataframes
pd.concat([menu,nutrition],axis=1,ignore_index=False)
# merge joins on the common column(s) and works on the intersection by default; it supports left, right, outer and inner joins (inner is the default)
menu.merge(nutrition) 

menu.merge(nutrition,how="inner")
       item  price popularity  avg_calorie protein
0     pizza  14.99       high         3200     12%
1     pasta  12.99     medium          740     10%
2     salad   7.99        low          240      6%
3  burritto  10.99       high          940     16%

menu.merge(nutrition,how="outer")
       item  price popularity  avg_calorie protein
0     pizza  14.99       high       3200.0     12%
1     pasta  12.99     medium        740.0     10%
2     salad   7.99        low        240.0      6%
3  burritto  10.99       high        940.0     16%
4      taco   6.99     medium          NaN     NaN
5    burger   5.99       high          NaN     NaN
6    pastry    NaN        NaN        800.0      4%

menu.merge(nutrition,how="left")
       item  price popularity  avg_calorie protein
0     pizza  14.99       high       3200.0     12%
1     pasta  12.99     medium        740.0     10%
2     salad   7.99        low        240.0      6%
3  burritto  10.99       high        940.0     16%
4      taco   6.99     medium          NaN     NaN
5    burger   5.99       high          NaN     NaN

menu.merge(nutrition,how="right")
       item  price popularity  avg_calorie protein
0     pizza  14.99       high         3200     12%
1    pastry    NaN        NaN          800      4%
2  burritto  10.99       high          940     16%
3     salad   7.99        low          240      6%
4     pasta  12.99     medium          740     10%

# use left_on and right_on when the key columns have different names in the two dataframes
menu.merge(nutrition,how="right",left_on="items",right_on="item")

# join is very similar to Excel's VLOOKUP; it is a left join by default

menu.join(nutrition)
      items  price popularity      item  avg_calorie protein
0     pizza  14.99       high     pizza       3200.0     12%
1     pasta  12.99     medium    pastry        800.0      4%
2     salad   7.99        low  burritto        940.0     16%
3  burritto  10.99       high     salad        240.0      6%
4      taco   6.99     medium     pasta        740.0     10%
5    burger   5.99       high       NaN          NaN     NaN

# to join on the key properly, use set_index first; otherwise the output is index-aligned, like above
menu.set_index('items').join(nutrition.set_index('item'))
          price popularity  avg_calorie protein
items                                          
pizza     14.99       high       3200.0     12%
pasta     12.99     medium        740.0     10%
salad      7.99        low        240.0      6%
burritto  10.99       high        940.0     16%
taco       6.99     medium          NaN     NaN
burger     5.99       high          NaN     NaN

How to check whether any item from a list appears in a column

import pandas as pd
df = pd.DataFrame({'IDs':[1234,5346,1234,8793,8793],
                    'Names':['APPLE ABCD ONE','APPLE ABCD','NO STRAWBERRY YES','ORANGE AVAILABLE','TEA AVAILABLE']})

df
    IDs              Names
0  1234     APPLE ABCD ONE
1  5346         APPLE ABCD
2  1234  NO STRAWBERRY YES
3  8793   ORANGE AVAILABLE
4  8793      TEA AVAILABLE
lis = ['ABCD', 'TEA']
df['Names'].apply(lambda x: any([k in x for k in lis]))
  
0     True
1     True
2    False
3    False
4     True
Name: Names, dtype: bool
df['FLAG']=df['Names'].apply(lambda x: any([k in x for k in lis]))
  
df
  
    IDs              Names   FLAG
0  1234     APPLE ABCD ONE   True
1  5346         APPLE ABCD   True
2  1234  NO STRAWBERRY YES  False
3  8793   ORANGE AVAILABLE  False
4  8793      TEA AVAILABLE   True

df['FLAG']=df['Names'].apply(lambda x: any([k in x for k in lis])).replace(True,'TDS').replace(False,'DB')
df
  
    IDs              Names FLAG
0  1234     APPLE ABCD ONE  TDS
1  5346         APPLE ABCD  TDS
2  1234  NO STRAWBERRY YES   DB
3  8793   ORANGE AVAILABLE   DB
4  8793      TEA AVAILABLE  TDS

Applying the above checks end to end

import pandas as pd

df = pd.read_excel(r'C:\HEB KPI\Book1.xlsx',header=0)
               
df['Report Detail [Table,filed & calculations]'] = df['Report Detail [Table,filed & calculations]'].str.split('\n')

df= df.explode('Report Detail [Table,filed & calculations]')
       
df.to_excel(r'C:\LOCATION\Testing_v0.xlsx',index=False)

lis = ['ABCD', 'TEA']

df['FLAG']=df['Names'].apply(lambda x: any([k in x for k in lis])).replace(True,'TDS').replace(False,'DB')

df.to_excel(r'C:\Sample Data\Python1.xlsx',index=False)

df1['FLAG']=df1['Report Detail'].apply(lambda x: any([k in str(x) for k in lis])).replace(True,'TDS').replace(False,'DB')

Data Aggregation groupby

Excel approach- ALT + D + P + P to create a pivot table
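The pandas counterpart is pivot_table; a sketch assuming the same Region/Category/Profit columns used in the groupby examples below:

# rows = Region, columns = Category, cells = sum of Profit
pd.pivot_table(df, index='Region', columns='Category', values='Profit', aggfunc='sum')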

# To get the count for all the columns
df.groupby('Region').count()
# To get sum of element in specific column
df.groupby(['Region'])['Profit'].sum()
Region
Central     39706.3625
East        91522.7800
South       46749.4303
West       108418.4489
Name: Profit, dtype: float64
# for grouping by multiple columns
df.groupby(['Region','Category'])['Profit'].sum()
Region   Category       
Central  Furniture          -2871.0494
         Office Supplies     8879.9799
         Technology         33697.4320
East     Furniture           3046.1658
         Office Supplies    41014.5791
         Technology         47462.0351
South    Furniture           6771.2061
         Office Supplies    19986.3928
         Technology         19991.8314
West     Furniture          11504.9503
         Office Supplies    52609.8490
         Technology         44303.6496
Name: Profit, dtype: float64
# we can apply a custom aggregation with groupby; the code below returns the absolute mean
df.groupby(['Region','Category'])['Profit'].apply(lambda x: abs(x.mean()))

Assignment

import pandas as pd
import numpy as np
df = pd.read_excel(r'C:\Sample Data\commute.xlsx',header=0,index_col="Date")
df
           Subway  Taxi  Bus Ferry
Date                              
2020-06-01    Yes     2    7   Yes
2020-06-02    Yes     1    9   Yes
2020-06-03    Yes     8   10   Yes
2020-06-04     No     8    8   Yes
2020-06-05     No     4    7   Yes
...           ...   ...  ...   ...
2020-11-06     No     4    8    No
2020-11-07    Yes     1    7   Yes
2020-11-08    Yes     7    7   Yes
2020-11-09     No     8   13   Yes
2020-11-10    Yes     5    7    No

[163 rows x 4 columns]
df.replace(['Yes','No'],[1,0])
  
            Subway  Taxi  Bus  Ferry
Date                                
2020-06-01       1     2    7      1
2020-06-02       1     1    9      1
2020-06-03       1     8   10      1
2020-06-04       0     8    8      1
2020-06-05       0     4    7      1
...            ...   ...  ...    ...
2020-11-06       0     4    8      0
2020-11-07       1     1    7      1
2020-11-08       1     7    7      1
2020-11-09       0     8   13      1
2020-11-10       1     5    7      0

[163 rows x 4 columns]
df=df.replace(['Yes','No'],[1,0])
s = np.array(df)
s
y = np.array([8,3,0.5,12])
y
array([ 8. ,  3. ,  0.5, 12. ])
s.dot(y)
array([29.5, 27.5, 49. , 40. , 27.5, 16.5, 30. , 38.5, 29. , 28. , 32. ,
       30. , 30. , 48. , 33. , 45. , 38. , 28. , 24.5, 14. , 31. , 39. ,
       38. , 24.5, 30. , 36.5, 39. , 48.5, 30.5, 28. , 32. , 33. , 41.5,
       26.5, 25. , 20.5, 16.5, 21.5, 30.5, 27.5, 45. , 13. , 34.5, 21. ,
       44. , 39. , 18.5, 36.5, 33.5, 36.5, 17. , 10. , 27. , 19. , 18.5,
       42. , 46.5, 48.5, 39. , 26. , 21.5, 19.5, 26. , 48.5, 29.5, 24. ,
       29. , 12. , 17.5, 31.5, 18. , 39.5, 17. , 29. , 22.5, 48. , 19.5,
       33. , 15.5, 23. , 31. , 15. , 22.5, 23.5, 25.5, 39. ,  6. , 28.5,
       42. , 21.5, 22.5, 46.5, 15.5, 21.5, 29. , 28. , 29. , 18. , 34. ,
       35. , 30.5, 23. , 22.5, 37. , 45.5, 32.5,  7.5, 21. ,  7. , 32.5,
       37. , 23. , 14.5, 39.5, 14.5, 35. , 22. , 18. , 33.5, 40. , 16.5,
       19.5, 39.5, 17. , 19.5, 28. , 43. , 24.5, 43. , 24. , 25.5, 40. ,
       13. , 25.5, 24. , 27. , 46. , 30.5, 21. , 15. , 26. , 24. , 27.5,
       30. , 48. , 50.5, 40.5, 38.5, 25. , 13.5, 18.5, 27.5, 28. , 27. ,
       20. , 18. , 13. , 43.5, 16. , 26.5, 44.5, 42.5, 26.5])
daily_expense=s.dot(y)
daily_expense.sum()
4683.0
daily_expense.argmax()
145
df.index[daily_expense.argmax()]
Timestamp('2020-10-24 00:00:00')
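Sanity check for the first row (2020-06-01: Subway=1, Taxi=2, Bus=7, Ferry=1): 1*8 + 2*3 + 7*0.5 + 1*12 = 29.5, which matches the first entry of s.dot(y).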

OS

import os
cwd = os.getcwd()
os.chdir('C:\\Sample Data\\')
os.rmdir('')
os.system('') # run a shell command, e.g. to open a file
os.path.join(cwd,'Sample Data','Folder 1') # builds a path from parts
import glob
# glob matches multiple files at once; * is a wildcard for anything
filenames = glob.glob('*')
# starts with anything, ends with anything, but must contain 2016
filenames = glob.glob('*2016*')
# to get all files with a given extension like .xlsx, two folder levels down
glob.glob(cwd+'\\*\\*\\*.xlsx')

Tricks

lis = [1,2,3,4,5,6]
head,a,*tail = lis
print(head,a)
print(tail)

output
1 2
[3, 4, 5, 6]

lis = [1,2,3,4,5,6]
*head,tail = lis
print(head)
print(tail)

output
[1, 2, 3, 4, 5]
6

To sort files into folders according to their name pattern.

import os
import glob

os.chdir("D:\\Udemy\\Excel Automation\\assignment")

cwd = os.getcwd()

filenames = glob.glob("*.xlsx")

for file in filenames:
    year = file.split(".")[-2][-4:] # take the piece before the extension, then its last 4 characters (the year)
    try:
        int(year)
    except:
        continue
    if os.path.isdir(year) == False:
        os.mkdir(year)
    
    os.rename(file,os.path.join(cwd,year,file))

To get data from multiple sources and merge it into one Excel file.

import os
import glob
import pandas as pd

os.chdir("D:\\Udemy\\Excel Automation\\assignment")

cwd = os.getcwd()

filenames = glob.glob(cwd+"\\*\\*xlsx")

consolidated = pd.DataFrame(columns = pd.read_excel(filenames[0]).columns)
for file in filenames:
    temp = pd.read_excel(file)
    consolidated = pd.concat([consolidated, temp], ignore_index = True)  # DataFrame.append was removed in pandas 2.0
    
consolidated.to_excel("consolidated.xlsx", index = False)

To delete rows with a specific value

df = pd.DataFrame(mydataset)
df
    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2

i = df[df.cars=='Volvo'].index
i
Index([1], dtype='int64')
df.drop(i)
   cars  passings
0   BMW         3
2  Ford         2

Distinct / unique to get unique values.

df.cars.unique()
array(['BMW', 'Volvo', 'Ford'], dtype=object)
#or
np.unique(df[['col1','col2']], axis=0)
#
df.nunique()
cars        3
passings    3
dtype: int64

to drop duplicates

df.drop_duplicates()
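drop_duplicates also accepts subset and keep; a quick sketch on the cars example above:

# keep only the first row for each cars value
df.drop_duplicates(subset=['cars'], keep='first')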

VLOOKUP in python

import os
import pandas as pd

os.chdir("D:\\Udemy\\Excel Automation")

sales = pd.read_excel("sales_data.xlsx") #importing left dataframe

zip_income = pd.read_csv("zipcode_income.csv", engine='python') # importing right dataframe

temp = sales.merge(zip_income.loc[:,["Zip_Code","Mean"]].rename(columns={"Zip_Code":"Postal Code", "Mean":"Mean Income"})
                   ,how="left",on="Postal Code") # merging the left df with relevant columns of the right df

temp.drop_duplicates(subset=["Row ID"], keep ="first", inplace=True) # dropping duplicates

IF condition

# excel
=IF(O2="Bookcases",U2*0.8,IF(O2="Chairs",U2*10))
# Python
import numpy as np
df["Profit Percent"]=np.where(df['Category']=='Furniture',df['Per Units Sales']*0.5,np.where(df['Category']=='Office Supplies',df['Per Units Sales']*0.7,0))
#or
df['visits_category'] = np.where(df['visits_30days']== 0, 'YES', 'NO')
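For more than two branches, np.select is an alternative to nesting np.where (my addition, not from the original notes); a sketch with the same columns:

conditions = [df['Category']=='Furniture', df['Category']=='Office Supplies']
choices = [df['Per Units Sales']*0.5, df['Per Units Sales']*0.7]
df['Profit Percent'] = np.select(conditions, choices, default=0)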

Some more operations

#Ordered dataframe
df['Order ID']
0       CA-2016-152156
1       CA-2016-152156
2       CA-2016-138688
3       US-2015-108966
4       US-2015-108966
             ...      
9989    CA-2014-110422
9990    CA-2017-121258
9991    CA-2017-121258
9992    CA-2017-121258
9993    CA-2017-119914
Name: Order ID, Length: 9994, dtype: object

Slicing

df['Order ID'].str[3:7]
0       2016
1       2016
2       2016
3       2015
4       2015
        ... 
9989    2014
9990    2017
9991    2017
9992    2017
9993    2017
Name: Order ID, Length: 9994, dtype: object

df['Order ID'].str[-7:]
0       -152156
1       -152156
2       -138688
3       -108966
4       -108966
         ...   
9989    -110422
9990    -121258
9991    -121258
9992    -121258
9993    -119914
Name: Order ID, Length: 9994, dtype: object

df['Order ID'].str[:2]
0       CA
1       CA
2       CA
3       US
4       US
        ..
9989    CA
9990    CA
9991    CA
9992    CA
9993    CA
Name: Order ID, Length: 9994, dtype: object

Split

df['Order ID'].str.split('-')
0       [CA, 2016, 152156]
1       [CA, 2016, 152156]
2       [CA, 2016, 138688]
3       [US, 2015, 108966]
4       [US, 2015, 108966]
               ...        
9989    [CA, 2014, 110422]
9990    [CA, 2017, 121258]
9991    [CA, 2017, 121258]
9992    [CA, 2017, 121258]
9993    [CA, 2017, 119914]
Name: Order ID, Length: 9994, dtype: object

Split indexing

df['Order ID'].str.split('-')[1]
['CA', '2016', '152156']
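To pull one piece out of every row's split result (instead of indexing a single row), chain .str again; for example, the year part:

df['Order ID'].str.split('-').str[1]  # '2016', '2016', ... for every row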

Removing extra spaces (trim)

df['Order ID'].str.strip()
0       CA-2016-152156
1       CA-2016-152156
2       CA-2016-138688
3       US-2015-108966
4       US-2015-108966
             ...      
9989    CA-2014-110422
9990    CA-2017-121258
9991    CA-2017-121258
9992    CA-2017-121258
9993    CA-2017-119914
Name: Order ID, Length: 9994, dtype: object

Concatenate

df['location'] = df['Country']+"_"+df["City"]

Upper and lower

df['Category'].str.upper()
   
0             FURNITURE
1             FURNITURE
2       OFFICE SUPPLIES
3             FURNITURE
4       OFFICE SUPPLIES
             ...       
9989          FURNITURE
9990          FURNITURE
9991         TECHNOLOGY
9992    OFFICE SUPPLIES
9993    OFFICE SUPPLIES
Name: Category, Length: 9994, dtype: object

df['Category'].str.lower()
   
0             furniture
1             furniture
2       office supplies
3             furniture
4       office supplies
             ...       
9989          furniture
9990          furniture
9991         technology
9992    office supplies
9993    office supplies
Name: Category, Length: 9994, dtype: object

Find

df['Category'].str.find('Fur')
   
0       0
1       0
2      -1
3       0
4      -1
       ..
9989    0
9990    0
9991   -1
9992   -1
9993   -1
Name: Category, Length: 9994, dtype: int64

df['Category'].str.find('Fur').replace(-1,'Not Found')
   
0               0
1               0
2       Not Found
3               0
4       Not Found
          ...    
9989            0
9990            0
9991    Not Found
9992    Not Found
9993    Not Found
Name: Category, Length: 9994, dtype: object

With conditions

# load the sales data used by the countif/sumif examples below
import os
import pandas as pd

os.chdir("D:\\Udemy\\Excel Automation") # changes working directory

sales_data = pd.read_excel("sales_data.xlsx")

countif

len(sales_data[sales_data["Quantity"] > 5])
sales_data[sales_data["Quantity"] > 5].shape[0]

countifs

len(sales_data[(sales_data["State"]=="Kentucky") & (sales_data["Quantity"] > 5)])
sales_data[(sales_data["State"]=="Kentucky") & (sales_data["Quantity"] > 5)].shape[0]

sumif

sales_data[sales_data["City"].str[:4]=="Fort"]["Profit"].sum()
sales_data.loc[sales_data["City"].str[:4]=="Fort","Profit"].sum()

sumifs

sales_data[(sales_data["City"].str[:4]=="Fort") & (sales_data["Quantity"] > 5)]["Profit"].sum()
sales_data.loc[(sales_data["City"].str[:4]=="Fort") & (sales_data["Quantity"] > 5),"Profit"].sum()
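The same filter-then-aggregate pattern covers AVERAGEIFS too; just swap the final aggregation:

# averageifs: mean Profit for 'Fort*' cities with Quantity > 5
sales_data.loc[(sales_data["City"].str[:4]=="Fort") & (sales_data["Quantity"] > 5),"Profit"].mean()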

loc & iloc

df.loc[0]
Row ID                                             1
Order ID                              CA-2016-152156
Order Date                       2016-11-08 00:00:00
Ship Date                        2016-11-11 00:00:00
Ship Mode                               Second Class
Customer ID                                 CG-12520
Customer Name                            Claire Gute
Segment                                     Consumer
Country                                United States
City                                       Henderson
Postal Code                                    42420
Region                                         South
Product ID                           FUR-BO-10001798
Category                                   Furniture
Sub-Category                               Bookcases
Product Name       Bush Somerset Collection Bookcase
Sales                                         261.96
Quantity                                           2
Discount                                         0.0
Profit                                       41.9136
Per Units Sales                               130.98

df.loc[0:10]
    Row ID        Order ID Order Date  ... Discount    Profit Per Units Sales
0        1  CA-2016-152156 2016-11-08  ...     0.00   41.9136        130.9800
1        2  CA-2016-152156 2016-11-08  ...     0.00  219.5820        243.9800
2        3  CA-2016-138688 2016-06-12  ...     0.00    6.8714          7.3100
3        4  US-2015-108966 2015-10-11  ...     0.45 -383.0310        191.5155
4        5  US-2015-108966 2015-10-11  ...     0.20    2.5164         11.1840
5        6  CA-2014-115812 2014-06-09  ...     0.00   14.1694          6.9800
6        7  CA-2014-115812 2014-06-09  ...     0.00    1.9656          1.8200
7        8  CA-2014-115812 2014-06-09  ...     0.20   90.7152        151.1920
8        9  CA-2014-115812 2014-06-09  ...     0.20    5.7825          6.1680
9       10  CA-2014-115812 2014-06-09  ...     0.00   34.4700         22.9800
10      11  CA-2014-115812 2014-06-09  ...     0.20   85.3092        189.5760

df.iloc[[1,4]]
   Row ID        Order ID Order Date  ... Discount    Profit Per Units Sales
1       2  CA-2016-152156 2016-11-08  ...      0.0  219.5820         243.980
4       5  US-2015-108966 2015-10-11  ...      0.2    2.5164          11.184

# choosing specific columns over a row range
df.iloc[1:6,[1,4]]
         Order ID       Ship Mode
1  CA-2016-152156    Second Class
2  CA-2016-138688    Second Class
3  US-2015-108966  Standard Class
4  US-2015-108966  Standard Class
5  CA-2014-115812  Standard Class

# choosing n rows with slicing on the columns
df.iloc[1:10,2:]
  Order Date  Ship Date       Ship Mode  ... Discount    Profit Per Units Sales
1 2016-11-08 2016-11-11    Second Class  ...     0.00  219.5820        243.9800
2 2016-06-12 2016-06-16    Second Class  ...     0.00    6.8714          7.3100
3 2015-10-11 2015-10-18  Standard Class  ...     0.45 -383.0310        191.5155
4 2015-10-11 2015-10-18  Standard Class  ...     0.20    2.5164         11.1840
5 2014-06-09 2014-06-14  Standard Class  ...     0.00   14.1694          6.9800
6 2014-06-09 2014-06-14  Standard Class  ...     0.00    1.9656          1.8200
7 2014-06-09 2014-06-14  Standard Class  ...     0.20   90.7152        151.1920
8 2014-06-09 2014-06-14  Standard Class  ...     0.20    5.7825          6.1680
9 2014-06-09 2014-06-14  Standard Class  ...     0.00   34.4700         22.9800

# some condition
df.loc[df['Ship Mode']=='Second Class']
      Row ID        Order ID Order Date  ... Discount    Profit Per Units Sales
0          1  CA-2016-152156 2016-11-08  ...      0.0   41.9136         130.980
1          2  CA-2016-152156 2016-11-08  ...      0.0  219.5820         243.980
2          3  CA-2016-138688 2016-06-12  ...      0.0    6.8714           7.310
17        18  CA-2014-167164 2014-05-13  ...      0.0    9.9900          27.750
18        19  CA-2014-143336 2014-08-27  ...      0.0    2.4824           4.280
...      ...             ...        ...  ...      ...       ...             ...
9965    9966  CA-2016-146374 2016-12-05  ...      0.0    2.3406           4.980
9966    9967  CA-2016-146374 2016-12-05  ...      0.0   51.5543          15.670
9980    9981  US-2015-151435 2015-09-06  ...      0.0   22.3548          85.980
9989    9990  CA-2014-110422 2014-01-21  ...      0.2    4.1028           8.416

# with 2 condition and(&)
df.loc[(df['Region']=='South') & (df['Category']=='Furniture')]

# with 2 condition or(|)
df.loc[(df['Region']=='South') | (df['Category']=='Furniture')]

Matplotlib

import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
# add_axes takes [left, bottom, width, height] in figure coordinates (0 to 1)
ax = fig.add_axes([0,0,1,1])
x = np.arange(1,10)
# a simple line plot
ax.plot(x,x**2)
plt.show()

Sort values

df.loc[:,['Profit','Order ID']].sort_values(by=['Order ID'])
        Profit        Order ID
2717  109.6113  CA-2014-100006
6288   68.8464  CA-2014-100090
6287  -87.9354  CA-2014-100090
9514   31.8696  CA-2014-100293
3083    1.3257  CA-2014-100328
...        ...             ...
5932    1.0044  US-2017-169551
5933    4.8231  US-2017-169551
5934   85.7870  US-2017-169551
5929  -45.3492  US-2017-169551
5931 -113.9980  US-2017-169551

df.loc[:,['Profit','Order ID']].sort_values(by=['Order ID'],ascending=True)
        Profit        Order ID
2717  109.6113  CA-2014-100006
6288   68.8464  CA-2014-100090
6287  -87.9354  CA-2014-100090
9514   31.8696  CA-2014-100293
3083    1.3257  CA-2014-100328
...        ...             ...
5932    1.0044  US-2017-169551
5933    4.8231  US-2017-169551
5934   85.7870  US-2017-169551
5929  -45.3492  US-2017-169551
5931 -113.9980  US-2017-169551

[9994 rows x 2 columns]

df.loc[:,['Profit','Order ID']].sort_values(by=['Order ID'],ascending=False)
        Profit        Order ID
5934   85.7870  US-2017-169551
5933    4.8231  US-2017-169551
5930    5.4432  US-2017-169551
5931 -113.9980  US-2017-169551
5932    1.0044  US-2017-169551
...        ...             ...
3083    1.3257  CA-2014-100328
9514   31.8696  CA-2014-100293
6287  -87.9354  CA-2014-100090
6288   68.8464  CA-2014-100090
2717  109.6113  CA-2014-100006

Drop Duplicates

df.loc[:,['Profit','Order ID']].drop_duplicates()

Libraries required for all below

import pandas as pd
import os 
import numpy as np
import glob
import subprocess,openpyxl
from openpyxl import load_workbook
import re
from zipfile import ZipFile

Merging all Excel files in a folder into one Excel file

all_data = pd.DataFrame()
# Specify the folder containing your Excel files
folder_path = "C:\\Python Merge" 
file_list = glob.glob(folder_path + "/*.xlsx")
count = 0
 
for i in file_list: 
    print(i)
    df = pd.read_excel(i, header=0, sheet_name='Attribute Level Metadata')
    print('after if')
    all_data = pd.concat([all_data, df], ignore_index=True)  # pandas has no _union; concat does the append
    print('all data')
    count += 1
print('outside loop')
output_file_path = f"{folder_path}\\final.xlsx"
all_data.to_excel(output_file_path, sheet_name='Sheet1', index=False)
 
print("Data merged and saved to", output_file_path)

Column-wise split of data into multiple Excel files

#original
package='Meta Split'
data_df = pd.read_excel(r"C:\package15.xlsx",header=0,sheet_name='Attribute Level Metadata')
parent = 'C:\\Loc\\pack\\'
path=os.path.join(parent,package)
os.mkdir(path)
grouped_df = data_df.groupby('SPECIFICATION_PACKAGE')
for data in grouped_df:
    grouped_df.get_group(data[0]).to_excel(f"{path}\\"+data[0]+".xlsx",index=False)

Processing Jars using python

folder = 'C:\\Python Dup\\'
jarFile = f'{folder}Jar File'
base = f'{folder}excel\\template.xlsx'
file_list = glob.glob(folder + "/*.zip")
for i in range(len(file_list)):
    data = file_list[i].split('.')[0]
    wb=openpyxl.load_workbook(base)
    output_excel = f'{data}.xlsx'
    wb.save(output_excel)
    print(output_excel)
    try:
        cmd = f'cmd /c java -jar "{jarFile}" "{file_list[i]}" "{output_excel}"'
        subprocess.run(cmd,shell=True,check=True)
    except Exception as e:
        print(f'{data} {e}')

Zipping multiple files in one go

import zipfile
import os
 
def zip_files_in_folder(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".xml"):
            file_name_1 = file_name.split('.')[0]
            file_path = os.path.join(folder_path, file_name)
            zip_file_path = os.path.join(folder_path, f"{file_name_1}.zip")
 
            with zipfile.ZipFile(zip_file_path, 'w',zipfile.ZIP_DEFLATED) as zip_file:
                zip_file.write(file_path, file_name)
 
if __name__ == "__main__":
    folder_path = "C:\\Python Dup"  # Replace with your actual folder path
    zip_files_in_folder(folder_path)

Comparing two folders and appending the data of files whose names match

# package='Meta Split_4'
# path=os.path.join("C:\\Merge",package)
# os.mkdir(path)
# file_list = os.listdir(folder_path)
# file_list_1 = os.listdir(folder_path_1)
folder_path = "C:\\Meta\\Meta Split_2"
file_list = glob.glob(folder_path + "/*.xlsx")
folder_path_1 = "C:\\Meta\\Meta Split_3"
file_list_1 = glob.glob(folder_path_1 + "/*.xlsx")
count=0
for i in file_list:
    data1 = i.split('\\')[-1].removesuffix('.xlsx')  # strip('.xlsx') would remove matching characters, not the suffix
    for j in file_list_1:
        data2 = j.split('\\')[-1].removesuffix('.xlsx')
        if data1==data2:
            count+=1
            try:
                df1 = pd.read_excel(i, header=0)
                df2 = pd.read_excel(j, header=0)
                df3 = df1._append(df2, ignore_index=True)
                df3= df3.drop_duplicates()
                path = f"C:\\Meta\\Meta Split_4\\{data2}.xlsx"
                df3.to_excel(path, sheet_name='Sheet1', index=False)
            except Exception as e:
                print(e)
        else:
            print("else"+data2)
print(count)

Get non-distinct values in python

df1 = df[df.duplicated(subset=['COL_NAME','ATTRIBUTE_NAME'], keep=False)].drop_duplicates().sort_values('COL_NAME')
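A minimal sketch of what duplicated(keep=False) marks, on made-up data:

import pandas as pd
demo = pd.DataFrame({'COL_NAME': ['a','a','b'], 'ATTRIBUTE_NAME': [1,1,2]})
print(demo[demo.duplicated(subset=['COL_NAME','ATTRIBUTE_NAME'], keep=False)])
#   COL_NAME  ATTRIBUTE_NAME
# 0        a               1
# 1        a               1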

Count

df2 = pd.DataFrame(columns=['Package','Report_Count','Required Report'])
file_list_1 = glob.glob(r'C:\dump' + "/*.xlsm")
for i in file_list_1:
    print(i)
    file_name = i.split('\\')[-1]
    file_name = re.sub(r'\.xlsm$','',file_name)
    wb = openpyxl.load_workbook(i,read_only=True,keep_vba=True)
    sheet = wb['Summary']
    c7 = sheet['C7'].value
    c9 = sheet['C9'].value
    df2 = df2._append({'Package':file_name,'Report_Count':c7,'Required Report':c9},ignore_index=True)
df2.to_excel(r'C:\Report Count Sheet\count_9.xlsx',index=False)

Handling duplicate report names

all_data = pd.DataFrame()
 
# Specify the folder containing your Excel files
folder_path = "C:\\testing 2.0"
file_list = glob.glob(folder_path + "/*.xlsx")
count = 0

for i in file_list: 
    print(i)
    df = pd.read_excel(i, header=0, sheet_name='Attribute Level Metadata')
    # print('after if')
    # print(len(df))
    # #print(df.columns.values)
    # print("Report Name Unique Count : ",len(df['COL_NAME'].unique()))
    # print("Report Description Unique Count : ",len(df['REPORT_DESCRIPTION'].unique()))
    all_data=all_data.drop_duplicates()
    df = df.drop_duplicates()
    if 'COL_NAME' in all_data.columns:
        # Update report names with the appropriate suffix for instances in the second file
        COL_NAMEs_set = set(all_data['COL_NAME'])
    # Update report names with the appropriate suffix
        for index, row in df.iterrows():
            COL_NAME = row['COL_NAME']
            if COL_NAME in COL_NAMEs_set:
                # suffix_count = list(df['COL_NAME']).count(COL_NAME)
                # suffix = '_1' if suffix_count==1 else f"_{suffix_count+1}"
                df.at[index, 'COL_NAME'] += f"_1"
                count=2
                while True:
                    if df.at[index, 'COL_NAME'] in COL_NAMEs_set:
                        df.at[index, 'COL_NAME'] += f"_{count}"
                        print(df.at[index, 'COL_NAME'])
                        count+=1
                    else:
                        break
    all_data = all_data._append(df, ignore_index=True)
    # print('all data')
    # print(len(all_data))
print('outside loop')
#output_file_path = f"{folder_path}\\final.xlsx"
output_file_path = folder_path
print("Unique COL_NAME in final package : ",len(all_data['COL_NAME'].unique()))
print("Unique REPORT_DESCRIPTION in final package : ",len(all_data['REPORT_DESCRIPTION'].unique()))
# row_limit = 1000000
# with pd.ExcelWriter(output_file_path, engine='xlsxwriter') as writer:
#     start_row=0
#     for i in range(0, len(all_data), row_limit):
#         df_chunk = all_data.iloc[i:i + row_limit]
#         sheet_name = f'Sheet{i // row_limit + 1}'
#         df_chunk.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False, header=True)
#         start_row += len(df_chunk) + 1

rows_per_file = 1000000
number_of_files = ((len(all_data)//rows_per_file))+1
start_index=0
end_index = rows_per_file
for i in range(number_of_files):
    filepart = f'{folder_path}\\final_{i}.xlsx'
    writer = pd.ExcelWriter(filepart)
    df_mod = all_data.iloc[start_index:end_index]
    df_mod.to_excel(writer, index=False, sheet_name='sheet')
    start_index = end_index
    end_index = end_index + rows_per_file
    writer.close()
print("Data merged and saved to", output_file_path)

Updated above code to track the changes

all_data = pd.DataFrame()
 
# Specify the folder containing your Excel files
folder_path = "C:\\Testing New"
file_list = glob.glob(folder_path + "/*.xlsx")
count = 0
df2 = pd.DataFrame(columns=['Package','Orig_COL_NAME','New_COL_NAME','Report_Desc','Specification_Pack'])
for i in file_list: 
    print(i)
    df = pd.read_excel(i, header=0, sheet_name='Attribute Level Metadata')
    # print('after if')
    # print(len(df))
    # #print(df.columns.values)
    # print("Report Name Unique Count : ",len(df['COL_NAME'].unique()))
    # print("Report Description Unique Count : ",len(df['REPORT_DESCRIPTION'].unique()))
    all_data=all_data.drop_duplicates()
    df = df.drop_duplicates()
    file_name = i.split('\\')[-1]
    file_name = re.sub('.xlsx','',file_name)
    if 'COL_NAME' in all_data.columns:
        # Update report names with the appropriate suffix for instances in the second file
        COL_NAMEs_set = set(all_data['COL_NAME'])
    # Update report names with the appropriate suffix
        for index, row in df.iterrows():
            COL_NAME = row['COL_NAME']
            if COL_NAME in COL_NAMEs_set:
                # suffix_count = list(df['COL_NAME']).count(COL_NAME)
                # suffix = '_1' if suffix_count==1 else f"_{suffix_count+1}"
                # rep_var = df.at[index, 'COL_NAME']
                #df.at[index, 'COL_NAME'] += f"_1"
                # nrep_var = df.at[index, 'COL_NAME']
                # report_desc = df.at[index,'REPORT_DESCRIPTION']
                # attribute_agg = df.at[index,'ATTRIBUTE_AGGREGATION']
                #df2 = df2._append({'Package':file_name,'Orig_COL_NAME':rep_var,'New_COL_NAME':nrep_var,'Specification_Pack':attribute_agg},ignore_index=True)
                count=1
                while True:
                    if df.at[index, 'COL_NAME'] in COL_NAMEs_set and count>1:
                        df.at[index, 'COL_NAME']= df.at[index, 'COL_NAME'][:-2]+f"_{count}"
                        print('count 2',df.at[index, 'COL_NAME'])
                        nrep_var = df.at[index, 'COL_NAME']
                        report_desc = df.at[index,'REPORT_DESCRIPTION']
                        attribute_agg = df.at[index,'ATTRIBUTE_AGGREGATION']
                        df2 = df2._append({'Package':file_name,'Orig_COL_NAME':rep_var,'New_COL_NAME':nrep_var,'Report_Desc':report_desc,'Specification_Pack':attribute_agg},ignore_index=True)
                        count+=1
                    elif df.at[index, 'COL_NAME'] in COL_NAMEs_set and count==1:
                        rep_var = df.at[index, 'COL_NAME']  # original name, captured before suffixing
                        df.at[index, 'COL_NAME'] += "_1"
                        print('count 1',df.at[index, 'COL_NAME'])
                        nrep_var = df.at[index, 'COL_NAME']
                        report_desc = df.at[index,'REPORT_DESCRIPTION']
                        attribute_agg = df.at[index,'ATTRIBUTE_AGGREGATION']
                        df2 = df2._append({'Package':file_name,'Orig_COL_NAME':rep_var,'New_COL_NAME':nrep_var,'Report_Desc':report_desc,'Specification_Pack':attribute_agg},ignore_index=True)
                        count+=1
                    else:
                        break
    all_data = all_data._append(df, ignore_index=True)
    # print('all data')
    # print(len(all_data))
print('outside loop')
#output_file_path = f"{folder_path}\\final.xlsx"
output_file_path = folder_path
print("Unique COL_NAME in final package : ",len(all_data['COL_NAME'].unique()))
print("Unique REPORT_DESCRIPTION in final package : ",len(all_data['REPORT_DESCRIPTION'].unique()))
df2.to_excel(r'C:\Testing New\change.xlsx',index=False)
print('change sheet generated')
rows_per_file = 1000000
number_of_files = ((len(all_data)//rows_per_file))+1
start_index=0
end_index = rows_per_file
for i in range(number_of_files):
    filepart = f'{folder_path}\\final_{i}.xlsx'
    writer = pd.ExcelWriter(filepart)
    df_mod = all_data.iloc[start_index:end_index]
    df_mod.to_excel(writer, index=False, sheet_name='sheet')
    start_index = end_index
    end_index = end_index + rows_per_file
    writer.close()
print("Data merged and saved to", output_file_path)

Unzipping a group of files by passing the folder location

import zipfile
import os
def unzip_files_in_folder(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zip"):
            zip_file_path = os.path.join(folder_path, file_name)
            with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
                zip_file.extractall(folder_path)
 
if __name__ == "__main__":
    folder_path = "C:\\miss"  # Replace with your actual folder path
    unzip_files_in_folder(folder_path)

Extracting data (schema) from specific file types like SQL scripts

import os
import glob
import re
import pandas as pd
folder = 'C:\\Scripts\\'
df = pd.DataFrame(columns=['Script Name','Schema Name','Table/View Name'])
file_list = glob.glob(folder + "/*.type")
lis = ['from','FROM','INSERT','delete','DELETE','insert','UPDATE','update','join','JOIN']
pattern = re.compile(r'\b\w+\.\w+\b')  # matches schema.table
for k in file_list:
    file_name = k.split('\\')[-1]
    file_name = re.sub(r'\.btq$','',file_name)
    with open(k, 'r') as f:
        srclist = f.readlines()
    for line in srclist:
        line = line.strip()
        if any(value in line for value in lis):
            result = pattern.search(line)
            if result is not None:
                var = result.group()
                schema = var.split('.')[0].strip()
                table = var.split('.')[-1].strip()
                df = df._append({'Script Name':file_name,'Schema Name':schema,'Table/View Name':table},ignore_index=True)
df.to_excel(r'C:\\BTEQ_Scripts\\output2.xlsx',index=False)
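A quick check of what the \b\w+\.\w+\b pattern captures, on a made-up line:

import re
pattern = re.compile(r'\b\w+\.\w+\b')
match = pattern.search('SELECT * FROM sales_db.orders o')
print(match.group())  # sales_db.orders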

To get a list of files inside multiple directories

path = 'C:\\folder'
for root,dirs,files in os.walk(path):
    # print("\nroot "+str(root))
    # print("dirs "+str(dirs))
    # print("file "+str(file))
    lis = ['sample_dir']
    if any(folder in root for folder in lis):
        for file in files: 
            if file.endswith('.csv'):
                file_path = os.path.join(root,file)
                print(file_path)

Type hints in python

from typing import List
def lis_avg(seq: List[float]) -> float:
    return sum(seq)/len(seq)
print(lis_avg([1,2,3]))  # the function takes one list, not separate arguments

Insert multiple headers & insert data into given columns

workbook = openpyxl.Workbook()
sheet = workbook.active
headers = ['A','B','C','D']
# example data; the original lists were not shown in the notes
list_A, list_B, list_D = [1, 2], ['x', 'y'], [True, False]

for col_num, header in enumerate(headers, start=1):
    sheet.cell(row=1, column=col_num, value=header)
for i, (value_A, value_B, value_D) in enumerate(zip(list_A, list_B, list_D), start=2):
    sheet.cell(row=i, column=1, value=value_A)  
    sheet.cell(row=i, column=2, value=value_B)  
    sheet.cell(row=i, column=4, value=value_D)  

file_name = 'data_with_columns.xlsx'
folder_loc = 'C:\\folder\\'
workbook.save(f'{folder_loc}{file_name}')
print(f"Data written to {file_name}")
