Pandas Basics

# !pip install pandas

import pandas as pd

df = pd.read_csv('nyc_weather.csv')
df
type(df)

pandas.core.frame.DataFrame

df['Temperature']

df[['Temperature']]

type(df[['Temperature']])

pandas.core.frame.DataFrame

type(df['Temperature'])  # 1d values or single column

pandas.core.series.Series

df['Temperature'][0]

38

type(df['Temperature'][0]) # 1 byte = 8 bits | 8 bytes = 64 bits

numpy.int64

df.Temperature

df['Temperature'].max()

50

df['Temperature'].mean()

34.67741935483871

import numpy as np

np.NaN

nan

16 + np.NaN

nan

2 - DataFrame Properties

df.shape

(31, 11)

df.size

341

df.ndim

2

3 - Slicing

df.head()

        EST  Temperature  DewPoint  Humidity  Sea Level PressureIn  \\\\
0  1/1/2016           38        23        52                 30.03
1  1/2/2016           36        18        46                 30.02
2  1/3/2016           40        21        47                 29.86
3  1/4/2016           25         9        44                 30.05
4  1/5/2016           20        -3        41                 30.57

   VisibilityMiles  WindSpeedMPH PrecipitationIn  CloudCover Events  \\\\
0               10           8.0               0           5    NaN
1               10           7.0               0           3    NaN
2               10           8.0               0           1    NaN
3               10           9.0               0           3    NaN
4               10           5.0               0           0    NaN

   WindDirDegrees
0             281
1             275
2             277
3             345
4             333

df.head(3)

        EST  Temperature  DewPoint  Humidity  Sea Level PressureIn  \\\\
0  1/1/2016           38        23        52                 30.03
1  1/2/2016           36        18        46                 30.02
2  1/3/2016           40        21        47                 29.86

   VisibilityMiles  WindSpeedMPH PrecipitationIn  CloudCover Events  \\\\
0               10           8.0               0           5    NaN
1               10           7.0               0           3    NaN
2               10           8.0               0           1    NaN

   WindDirDegrees
0             281
1             275
2             277

df.tail()

          EST  Temperature  DewPoint  Humidity  Sea Level PressureIn  \\\\
26  1/27/2016           41        22        45                 30.03
27  1/28/2016           37        20        51                 29.90
28  1/29/2016           36        21        50                 29.58
29  1/30/2016           34        16        46                 30.01
30  1/31/2016           46        28        52                 29.90

    VisibilityMiles  WindSpeedMPH PrecipitationIn  CloudCover Events  \\\\
26               10           7.0               T           3   Rain
27               10           5.0               0           1    NaN
28               10           8.0               0           4    NaN
29               10           7.0               0           0    NaN
30               10           5.0               0           0    NaN

    WindDirDegrees
26             311
27             234
28             298
29             257
30             241

df[1:6]

        EST  Temperature  DewPoint  Humidity  Sea Level PressureIn  \\\\
1  1/2/2016           36        18        46                 30.02
2  1/3/2016           40        21        47                 29.86
3  1/4/2016           25         9        44                 30.05
4  1/5/2016           20        -3        41                 30.57
5  1/6/2016           33         4        35                 30.50

   VisibilityMiles  WindSpeedMPH PrecipitationIn  CloudCover Events  \\\\
1               10           7.0               0           3    NaN
2               10           8.0               0           1    NaN
3               10           9.0               0           3    NaN
4               10           5.0               0           0    NaN
5               10           4.0               0           0    NaN

   WindDirDegrees
1             275
2             277
3             345
4             333
5             259

df[5:11][['Temperature', 'Humidity']]

    Temperature  Humidity
5            33        35
6            39        33
7            39        64
8            44        77
9            50        71
10           33        37

df[['Temperature','Humidity']][ df['Temperature'] == 30 ]

    Temperature  Humidity
13           30        47

df[['Temperature','Humidity']][ (df['Temperature'] == 30) & (df['Humidity'] == 47) ]

    Temperature  Humidity
13           30        47

df[ (df['Temperature'] == 30) & (df['Humidity'] == 47) ]

          EST  Temperature  DewPoint  Humidity  Sea Level PressureIn  \\\\
13  1/14/2016           30        12        47                 29.95

    VisibilityMiles  WindSpeedMPH PrecipitationIn  CloudCover Events  \\\\
13               10           5.0               T           7    NaN

    WindDirDegrees
13             266

4 - Columns

df.columns

Index(['EST', 'Temperature', 'DewPoint', 'Humidity', 'Sea Level PressureIn',
       'VisibilityMiles', 'WindSpeedMPH', 'PrecipitationIn', 'CloudCover',
       'Events', 'WindDirDegrees'],
      dtype='object')

df['EST']

0      1/1/2016
1      1/2/2016
2      1/3/2016
3      1/4/2016
4      1/5/2016
5      1/6/2016
6      1/7/2016
7      1/8/2016
8      1/9/2016
9     1/10/2016
10    1/11/2016
11    1/12/2016
12    1/13/2016
13    1/14/2016
14    1/15/2016
15    1/16/2016
16    1/17/2016
17    1/18/2016
18    1/19/2016
19    1/20/2016
20    1/21/2016
21    1/22/2016
22    1/23/2016
23    1/24/2016
24    1/25/2016
25    1/26/2016
26    1/27/2016
27    1/28/2016
28    1/29/2016
29    1/30/2016
30    1/31/2016
Name: EST, dtype: object

type(df['EST'][0])

str

5 - Operations

weather_data = {
    'day': ['1/1/2023', '1/2/2023', '1/3/2023', '1/4/2023','1/5/2023', '1/6/2023'],
    'temperature': [32,35,28,24,32,38],
    'windspeed': [6,7,8,2,4,3],
    'event': ['Rain', 'Sunny', 'Snow', 'Rain', 'Sunny', 'Snow']
}

df = pd.DataFrame(weather_data)
df

        day  temperature  windspeed  event
0  1/1/2023           32          6   Rain
1  1/2/2023           35          7  Sunny
2  1/3/2023           28          8   Snow
3  1/4/2023           24          2   Rain
4  1/5/2023           32          4  Sunny
5  1/6/2023           38          3   Snow

df.set_index('day')

          temperature  windspeed  event
day
1/1/2023           32          6   Rain
1/2/2023           35          7  Sunny
1/3/2023           28          8   Snow
1/4/2023           24          2   Rain
1/5/2023           32          4  Sunny
1/6/2023           38          3   Snow

df

        day  temperature  windspeed  event
0  1/1/2023           32          6   Rain
1  1/2/2023           35          7  Sunny
2  1/3/2023           28          8   Snow
3  1/4/2023           24          2   Rain
4  1/5/2023           32          4  Sunny
5  1/6/2023           38          3   Snow

df.set_index('day', inplace = True)
df

          temperature  windspeed  event
day
1/1/2023           32          6   Rain
1/2/2023           35          7  Sunny
1/3/2023           28          8   Snow
1/4/2023           24          2   Rain
1/5/2023           32          4  Sunny
1/6/2023           38          3   Snow

df.columns

Index(['temperature', 'windspeed', 'event'], dtype='object')

df.index

Index(['1/1/2023', '1/2/2023', '1/3/2023', '1/4/2023', '1/5/2023', '1/6/2023'], dtype='object', name='day')

df.loc['1/2/2023']

temperature       35
windspeed          7
event          Sunny
Name: 1/2/2023, dtype: object

type(df.loc['1/2/2023'])

pandas.core.series.Series

df.reset_index(inplace = True)
df

        day  temperature  windspeed  event
0  1/1/2023           32          6   Rain
1  1/2/2023           35          7  Sunny
2  1/3/2023           28          8   Snow
3  1/4/2023           24          2   Rain
4  1/5/2023           32          4  Sunny
5  1/6/2023           38          3   Snow

df.set_index('event', inplace = True)
df

            day  temperature  windspeed
event
Rain   1/1/2023           32          6
Sunny  1/2/2023           35          7
Snow   1/3/2023           28          8
Rain   1/4/2023           24          2
Sunny  1/5/2023           32          4
Snow   1/6/2023           38          3

df.loc['Snow']

            day  temperature  windspeed
event
Snow   1/3/2023           28          8
Snow   1/6/2023           38          3

df.reset_index(inplace = True)

6 - Different ways to create a dataframe

using excel

df = pd.read_excel('weather_data.xlsx', 'Sheet1')

df

         day  temperature  windspeed  event
0 2017-01-01           32          6   Rain
1 2017-01-02           35          7  Sunny
2 2017-01-03           28          2   Snow

using list and tuple

weather_data = [
    ('1/1/2023', 32, 6, 'Rain'),
    ('1/2/2023', 35, 6, 'Snow'),
    ('1/3/2023', 30, 6, 'Sunny')
]

df = pd.DataFrame(weather_data)
df

          0   1  2      3
0  1/1/2023  32  6   Rain
1  1/2/2023  35  6   Snow
2  1/3/2023  30  6  Sunny

weather_data = [
    ('1/1/2023', 32, 6, 'Rain'),
    ('1/2/2023', 35, 6, 'Snow'),
    ('1/3/2023', 30, 6, 'Sunny')
]

df = pd.DataFrame(weather_data, columns=['day', 'temperature', 'windspeed', 'event'])
df

        day  temperature  windspeed  event
0  1/1/2023           32          6   Rain
1  1/2/2023           35          6   Snow
2  1/3/2023           30          6  Sunny

df.to_csv('indore.csv', index=True)

df.to_excel('goa.xlsx', sheet_name='weather' , index=True)

# from pandasql import sqldf

# q = "SELECT * from students"

# sqldf(q)

Handling Missing Values

df = pd.read_csv('weather_data2.csv', parse_dates = ['day'])

df

         day  temperature  windspeed   event
0 2017-01-01         32.0        6.0    Rain
1 2017-01-04          NaN        9.0   Sunny
2 2017-01-05         28.0        NaN    Snow
3 2017-01-06          NaN        7.0     NaN
4 2017-01-07         32.0        NaN    Rain
5 2017-01-08          NaN        NaN   Sunny
6 2017-01-09          NaN        NaN     NaN
7 2017-01-10         34.0        8.0  Cloudy
8 2017-01-11         40.0       12.0   Sunny

type(df['day'][0])

pandas._libs.tslibs.timestamps.Timestamp

df.set_index('day', inplace=True)
df

            temperature  windspeed   event
day
2017-01-01         32.0        6.0    Rain
2017-01-04          NaN        9.0   Sunny
2017-01-05         28.0        NaN    Snow
2017-01-06          NaN        7.0     NaN
2017-01-07         32.0        NaN    Rain
2017-01-08          NaN        NaN   Sunny
2017-01-09          NaN        NaN     NaN
2017-01-10         34.0        8.0  Cloudy
2017-01-11         40.0       12.0   Sunny

fillna

fill not available