import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
# For dealing with xlsx format.
!pip install openpyxl
Requirement already satisfied: openpyxl in c:\users\rosha\appdata\local\programs\python\python39\lib\site-packages (3.0.7) Requirement already satisfied: et-xmlfile in c:\users\rosha\appdata\local\programs\python\python39\lib\site-packages (from openpyxl) (1.1.0)
df = pd.read_excel("./airlines_dataset.xlsx")
df.head()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
df.tail()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
10678 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
10679 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
10680 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
10681 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
10682 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
df.shape
(10683, 11)
df.info()
# Every column is string object except the price
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: int64(1), object(10) memory usage: 918.2+ KB
df.describe(include='all')
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 10683 | 10683 | 10683 | 10683 | 10682 | 10683 | 10683 | 10683 | 10682 | 10683 | 10683.000000 |
unique | 12 | 44 | 5 | 6 | 128 | 222 | 1343 | 368 | 5 | 10 | NaN |
top | Jet Airways | 18/05/2019 | Delhi | Cochin | DEL → BOM → COK | 18:55 | 19:00 | 2h 50m | 1 stop | No info | NaN |
freq | 3849 | 504 | 4537 | 4537 | 2376 | 233 | 423 | 550 | 5625 | 8345 | NaN |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9087.064121 |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4611.359167 |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1759.000000 |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5277.000000 |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8372.000000 |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12373.000000 |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 79512.000000 |
Airline column consists of different names of airlines that flew in the given routes in the span of an year.
Date of Journey indicates the date it flew from the Destination to the Source along the route mentioned in the other columns of the dataset.
The departure time is the time the plane flew which contains the time object whereas the arrival time contains time as well as the date if arrived the next day.
The total stops column can be derived from the number of routes the plane flew.
There are only two NaN values and the additional info column has no information in more in 50 percent of data. The column usually contains the data about the food serving, baggage details and layover details.
The price is the final column which indicates the price of service of airlines which can also be used as a predictive variable.
Categorical data
print('All airlines:',df.Airline.unique(),'\n')
# Names of every airlines
print('Total airlines count:',len(df.Airline.unique()),'\n')
# Total unique airlines count
print(df.Airline.describe(),'\n')
# Description of the column consisting of most frequent airways
print(df.Airline.isnull().unique(),'\n')
# Checking if there are any null values in the column
print(df.Airline.value_counts())
# Each airline with the number of records it has
All airlines: ['IndiGo' 'Air India' 'Jet Airways' 'SpiceJet' 'Multiple carriers' 'GoAir' 'Vistara' 'Air Asia' 'Vistara Premium economy' 'Jet Airways Business' 'Multiple carriers Premium economy' 'Trujet'] Total airlines count: 12 count 10683 unique 12 top Jet Airways freq 3849 Name: Airline, dtype: object [False] Jet Airways 3849 IndiGo 2053 Air India 1752 Multiple carriers 1196 SpiceJet 818 Vistara 479 Air Asia 319 GoAir 194 Multiple carriers Premium economy 13 Jet Airways Business 6 Vistara Premium economy 3 Trujet 1 Name: Airline, dtype: int64
df.Airline.value_counts().plot.pie(figsize=(9,9),labeldistance=None, legend=True)
<AxesSubplot:ylabel='Airline'>
# Plotting Price with respect to Airline
sns.catplot(y="Price", x="Airline", data=df.sort_values("Price", ascending=False), kind="boxen", height=6,aspect=3)
<seaborn.axisgrid.FacetGrid at 0x253a1e3bd60>
date = pd.to_datetime(df.Date_of_Journey)
#Converting Date_of_Journey object into a datetime64 object
date_only = date.dt.date
# Keeping the date part only as the time by default is 00:00:00
print(date_only.sort_values(ascending=True).iloc[[0,1,df.shape[0]/2,-2,-1]],'\n')
# Printing top 2, middle and bottom 2 rows
959 2019-01-03 6336 2019-01-03 3292 2019-05-24 4997 2019-12-06 1081 2019-12-06 Name: Date_of_Journey, dtype: object
print("Earliest date is:", date_only.min())
print("Last date is:", date_only.max())
# The lower bound and the upper bound of data are January 3 to December 6, 2019.
Earliest date is: 2019-01-03 Last date is: 2019-12-06
print(date_only.value_counts(dropna=False).sort_index())
# Finding total flights in each day of the year
2019-01-03 199 2019-01-04 257 2019-01-05 277 2019-01-06 342 2019-03-03 315 2019-03-04 110 2019-03-05 90 2019-03-06 333 2019-03-15 162 2019-03-18 156 2019-03-21 423 2019-03-24 323 2019-03-27 299 2019-04-15 89 2019-04-18 67 2019-04-21 82 2019-04-24 92 2019-04-27 94 2019-05-15 405 2019-05-18 504 2019-05-21 497 2019-05-24 286 2019-05-27 382 2019-06-03 403 2019-06-04 100 2019-06-05 282 2019-06-06 503 2019-06-15 328 2019-06-18 105 2019-06-21 109 2019-06-24 351 2019-06-27 355 2019-09-03 302 2019-09-04 125 2019-09-05 484 2019-09-06 495 2019-12-03 142 2019-12-04 63 2019-12-05 259 2019-12-06 493 Name: Date_of_Journey, dtype: int64
date_only.value_counts().sort_index(ascending=False).plot(kind='barh',figsize=(10,10))
# Most flights were on May 18 (Saturday), June 6 (Thursday), September 6 (Friday) and December 6 (Friday).
<AxesSubplot:>
# Since the data consists of the year 2019 and it is repeated throughout the column, the year can be removed.
# Renaming the column
df.rename(columns={"Date_of_Journey": "Date_of_Journey_2019"}, inplace=True)
# Removing the redundant date
df.Date_of_Journey_2019 = df.Date_of_Journey_2019.str.slice(stop=-5)
df
Airline | Date_of_Journey_2019 | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | 24/03 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
1 | Air India | 1/05 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
2 | Jet Airways | 9/06 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
3 | IndiGo | 12/05 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
4 | IndiGo | 01/03 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | 9/04 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
10679 | Air India | 27/04 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
10680 | Jet Airways | 27/04 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
10681 | Vistara | 01/03 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
10682 | Air India | 9/05 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10683 rows × 11 columns
# Separating Journey Day
df['Journey_Day'] = pd.to_datetime(df.Date_of_Journey_2019, format="%d/%m").dt.day
# Separating Journey Month
df['Journey_Month'] = pd.to_datetime(df.Date_of_Journey_2019, format="%d/%m").dt.month
# dropping journey column
df.drop(columns=['Date_of_Journey_2019'],axis=1,inplace=True)
df
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journey_Day | Journey_Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 | 9 | 4 |
10679 | Air India | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 | 27 | 4 |
10680 | Jet Airways | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 | 27 | 4 |
10681 | Vistara | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 | 1 | 3 |
10682 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 | 9 | 5 |
10683 rows × 12 columns
print(df.Source.unique(),'\n')
# Unique sources for airlines departure
print(df.Source.value_counts(),'\n')
# Departure source count from most to least
print(df.Source.isnull().unique(),'\n')
# Checking if there are NaN values in source
print(df.Destination.unique(),'\n')
# Unique destinations for airlines arrival
print(df.Destination.value_counts(),'\n')
# Arrival destination count from most to least
print(df.Destination.isnull().unique(),'\n')
# Checking if there are NaN values in destination
['Banglore' 'Kolkata' 'Delhi' 'Chennai' 'Mumbai'] Delhi 4537 Kolkata 2871 Banglore 2197 Mumbai 697 Chennai 381 Name: Source, dtype: int64 [False] ['New Delhi' 'Banglore' 'Cochin' 'Kolkata' 'Delhi' 'Hyderabad'] Cochin 4537 Banglore 2871 Delhi 1265 New Delhi 932 Hyderabad 697 Kolkata 381 Name: Destination, dtype: int64 [False]
delhi_count = (df.Destination.values == ['Delhi']).sum()
print("Delhi count: ", delhi_count,'\n')
# Total instances of delhi data in destination
new_delhi_count = (df.Destination.values == ['New Delhi']).sum()
print("New Delhi count: ", new_delhi_count,'\n')
# Total instances of new delhi data in destination
# Total delhi instances
print("Total delhi count: ",delhi_count+new_delhi_count)
# Delhi would still come to the third to the most visited destinations after Cochin and Banglore.
Delhi count: 1265 New Delhi count: 932 Total delhi count: 2197
# Plotting price with respect to source
sns.catplot(y="Price", x="Source", data=df.sort_values("Price", ascending=False), kind="boxen", height=6,aspect=3)
<seaborn.axisgrid.FacetGrid at 0x2539a234d30>
len(df.Route.unique())
# All the plane flew in 129 unique routes
129
df.Route[0:10]
# Checking 10 instances of routes
0 BLR → DEL 1 CCU → IXR → BBI → BLR 2 DEL → LKO → BOM → COK 3 CCU → NAG → BLR 4 BLR → NAG → DEL 5 CCU → BLR 6 BLR → BOM → DEL 7 BLR → BOM → DEL 8 BLR → BOM → DEL 9 DEL → BOM → COK Name: Route, dtype: object
df.Total_Stops[0:10]
# Checking 10 instances of total stops
0 non-stop 1 2 stops 2 2 stops 3 1 stop 4 1 stop 5 non-stop 6 1 stop 7 1 stop 8 1 stop 9 1 stop Name: Total_Stops, dtype: object
It can be be seen that the number of total stops can be derived from the routes itself. One of these columns can be dropped in the data preprocessing stage.
print(df.Route.isnull().unique(),'\n')
# Checking if there are NaN values in route
# There ARE NaN values!!!
[False True]
print(df.Total_Stops.isnull().unique(),'\n')
# Checking if there are NaN values in steps
# There ARE NaN values!!!
[False True]
# Replacing stops string values with an integer
df.Total_Stops.replace(to_replace=['non-stop','1 stop','2 stops', '3 stops','4 stops'],value=[0,1,2,3,4], inplace=True)
stops = df.Total_Stops
stops.unique()
array([ 0., 2., 1., 3., nan, 4.])
# Counting the nan values
stops.isnull().value_counts()
False 10682 True 1 Name: Total_Stops, dtype: int64
# Finding the index of nan values
print(np.argwhere(np.isnan(np.array(stops))))
df.iloc[9039]
[[9039]]
Airline Air India Source Delhi Destination Cochin Route NaN Dep_Time 09:45 Arrival_Time 09:25 07 May Duration 23h 40m Total_Stops NaN Additional_Info No info Price 7480 Journey_Day 6 Journey_Month 5 Name: 9039, dtype: object
# Finding the most common routes and stops on the same airline with same sources, destinations and durations.
df[['Route','Total_Stops']].where((df.Source=='Delhi') & (df.Destination=='Cochin') &(df.Airline=='Air India') & (df.Duration=='23h 40m')).dropna().set_index('Route').value_counts()
# Since the most common route is DEL → HYD → MAA → COK with total_stops 2, the values shall be replaced now.
Total_Stops 2.0 14 1.0 12 dtype: int64
# Filling the values of routes
df.iloc[9039,3] = 'DEL → HYD → MAA → COK'
# Filling the value of stops
df.iloc[9039,7] = 2
# The value is replaced and no other nan values exist.
print(df.iloc[9039],'\n')
print(df.Total_Stops.isnull().value_counts())
Airline Air India Source Delhi Destination Cochin Route DEL → HYD → MAA → COK Dep_Time 09:45 Arrival_Time 09:25 07 May Duration 23h 40m Total_Stops 2.0 Additional_Info No info Price 7480 Journey_Day 6 Journey_Month 5 Name: 9039, dtype: object False 10683 Name: Total_Stops, dtype: int64
# Converting the float values to integers
df.Total_Stops = df.Total_Stops.apply(np.int64)
df.Total_Stops
0 0 1 2 2 2 3 1 4 1 .. 10678 0 10679 0 10680 0 10681 0 10682 2 Name: Total_Stops, Length: 10683, dtype: int64
df.head(4)
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journey_Day | Journey_Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | 0 | No info | 3897 | 24 | 3 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 | No info | 7662 | 1 | 5 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 | No info | 13882 | 9 | 6 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 | No info | 6218 | 12 | 5 |
df.Additional_Info.value_counts(normalize=True).mul(100).round(8)
No info 78.114762 In-flight meal not included 18.552841 No check-in baggage included 2.995413 1 Long layover 0.177853 Change airports 0.065525 Business class 0.037443 No Info 0.028082 2 Long layover 0.009361 1 Short layover 0.009361 Red-eye flight 0.009361 Name: Additional_Info, dtype: float64
Since 78% data in this column is empty or consists of "No info" which is of no use, it is safe to drop the column as the the data which do exist are not as too crucial.
df.drop(['Additional_Info'], axis = 1, inplace = True)
df.head(3)
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Price | Journey_Day | Journey_Month | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | 0 | 3897 | 24 | 3 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 | 7662 | 1 | 5 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 | 13882 | 9 | 6 |
# Checking if any nan values on departure time
df.Dep_Time.isna().unique()
array([False])
# Formatting data into a more structured format
df['Dep_Hour'] = pd.to_datetime(df.Dep_Time).dt.hour
df['Dep_Min'] = pd.to_datetime(df.Dep_Time).dt.minute
# df.drop(['Dep_Time'], axis=1, inplace=True)
# dep_time = pd.to_datetime(df.Dep_Time).dt.time
# print(dep_time)
# df.Dep_Time = dep_time
df.head()
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | 0 | 3897 | 24 | 3 | 22 | 20 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 | 7662 | 1 | 5 | 5 | 50 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 | 13882 | 9 | 6 | 9 | 25 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 | 6218 | 12 | 5 | 18 | 5 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 | 13302 | 1 | 3 | 16 | 50 |
# df.Arrival_Time
df['Arrival_Hour'] = pd.to_datetime(df.Arrival_Time).dt.hour
df['Arrival_Min'] = pd.to_datetime(df.Arrival_Time).dt.minute
# df.drop(['Arrival_Time'],axis=1, inplace=True)
# arrival_time = pd.to_datetime(df.Arrival_Time).dt.time
# df.Arrival_Time = arrival_time
df.tail()
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10678 | Air Asia | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 |
10679 | Air India | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 |
10680 | Jet Airways | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 |
10681 | Vistara | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 |
10682 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 |
df["Dep_Time"]
0 22:20 1 05:50 2 09:25 3 18:05 4 16:50 ... 10678 19:55 10679 20:45 10680 08:20 10681 11:30 10682 10:55 Name: Dep_Time, Length: 10683, dtype: object
# Duration can be extracted like this.
dep_time = pd.to_datetime(df.Dep_Time).dt.time
df.Dep_Time = dep_time
arrival_time = pd.to_datetime(df.Arrival_Time).dt.time
df.Arrival_Time = arrival_time
from datetime import datetime, date
def time_between(df):
return datetime.combine(date.today(), df["Arrival_Time"]) - datetime.combine(date.today(), df["Dep_Time"])
duration = df.apply(time_between, axis=1).astype('str')
duration
0 -1 days +02:50:00 1 0 days 07:25:00 2 -1 days +19:00:00 3 0 days 05:25:00 4 0 days 04:45:00 ... 10678 0 days 02:30:00 10679 0 days 02:35:00 10680 0 days 03:00:00 10681 0 days 02:40:00 10682 0 days 08:20:00 Length: 10683, dtype: object
df['Duration_hours']=duration.str.slice(start=-8,stop=-6)
df['Duration_mins']=duration.str.slice(start=-5,stop=-3)
df
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20:00 | 01:10:00 | 2h 50m | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 02 | 50 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50:00 | 13:15:00 | 7h 25m | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 07 | 25 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25:00 | 04:25:00 | 19h | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 00 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05:00 | 23:30:00 | 5h 25m | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 05 | 25 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50:00 | 21:35:00 | 4h 45m | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 04 | 45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | Kolkata | Banglore | CCU → BLR | 19:55:00 | 22:25:00 | 2h 30m | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 |
10679 | Air India | Kolkata | Banglore | CCU → BLR | 20:45:00 | 23:20:00 | 2h 35m | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 |
10680 | Jet Airways | Banglore | Delhi | BLR → DEL | 08:20:00 | 11:20:00 | 3h | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 |
10681 | Vistara | Banglore | New Delhi | BLR → DEL | 11:30:00 | 14:10:00 | 2h 40m | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 |
10682 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55:00 | 19:15:00 | 8h 20m | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 |
10683 rows × 17 columns
# Dropping redundant columns
df.drop(columns=['Duration','Dep_Time','Arrival_Time'],inplace=True)
df
Airline | Source | Destination | Route | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 02 | 50 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 07 | 25 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 00 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 05 | 25 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 04 | 45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | Kolkata | Banglore | CCU → BLR | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 |
10679 | Air India | Kolkata | Banglore | CCU → BLR | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 |
10680 | Jet Airways | Banglore | Delhi | BLR → DEL | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 |
10681 | Vistara | Banglore | New Delhi | BLR → DEL | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 |
10682 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 |
10683 rows × 14 columns
# Since route with the total_stops is redundant, the route column can be removed.
df.drop(columns=['Route'],inplace=True)
df
Airline | Source | Destination | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 02 | 50 |
1 | Air India | Kolkata | Banglore | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 07 | 25 |
2 | Jet Airways | Delhi | Cochin | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 00 |
3 | IndiGo | Kolkata | Banglore | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 05 | 25 |
4 | IndiGo | Banglore | New Delhi | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 04 | 45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | Kolkata | Banglore | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 |
10679 | Air India | Kolkata | Banglore | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 |
10680 | Jet Airways | Banglore | Delhi | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 |
10681 | Vistara | Banglore | New Delhi | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 |
10682 | Air India | Delhi | Cochin | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 |
10683 rows × 13 columns
sns.displot(df, x=df.Price,height=8.27, aspect=11.7/8.27)
# Visualizing the distribution of price
# The data is heavily distributed between 0 to 30,000.
<seaborn.axisgrid.FacetGrid at 0x253accc6730>
df.duplicated().value_counts()
# Total of 223 duplicated columns
False 10460 True 223 dtype: int64
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df.duplicated().value_counts()
# No any data is duplicated.
False 10460 dtype: int64
df
Airline | Source | Destination | Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 02 | 50 |
1 | Air India | Kolkata | Banglore | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 07 | 25 |
2 | Jet Airways | Delhi | Cochin | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 00 |
3 | IndiGo | Kolkata | Banglore | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 05 | 25 |
4 | IndiGo | Banglore | New Delhi | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 04 | 45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10455 | Air Asia | Kolkata | Banglore | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 |
10456 | Air India | Kolkata | Banglore | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 |
10457 | Jet Airways | Banglore | Delhi | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 |
10458 | Vistara | Banglore | New Delhi | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 |
10459 | Air India | Delhi | Cochin | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 |
10460 rows × 13 columns
The data in the airline column is nominal since the order of the data does not matter. The data is handled with one hot encoding.
df.Airline
0 IndiGo 1 Air India 2 Jet Airways 3 IndiGo 4 IndiGo ... 10455 Air Asia 10456 Air India 10457 Jet Airways 10458 Vistara 10459 Air India Name: Airline, Length: 10460, dtype: object
# Generating dummy variables and dropping the first to avoid the dummy variable trap
Airline = pd.get_dummies(df.Airline, drop_first=True,prefix='Airline')
Airline
Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10455 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10456 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10457 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10458 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
10459 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10460 rows × 11 columns
df.Source.unique()
array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)
Source = pd.get_dummies(df.Source, drop_first=True,prefix='Source')
Source
Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 1 | 0 |
2 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 1 | 0 |
4 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... |
10455 | 0 | 0 | 1 | 0 |
10456 | 0 | 0 | 1 | 0 |
10457 | 0 | 0 | 0 | 0 |
10458 | 0 | 0 | 0 | 0 |
10459 | 0 | 1 | 0 | 0 |
10460 rows × 4 columns
# Similar with the destination column
Destination = pd.get_dummies(df.Destination, drop_first=True,prefix='Destination')
Destination
Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... |
10455 | 0 | 0 | 0 | 0 | 0 |
10456 | 0 | 0 | 0 | 0 | 0 |
10457 | 0 | 1 | 0 | 0 | 0 |
10458 | 0 | 0 | 0 | 0 | 1 |
10459 | 1 | 0 | 0 | 0 | 0 |
10460 rows × 5 columns
# All the encoded columns are now appended into the dataset
df = pd.concat([df,Airline,Source,Destination], axis=1)
df.drop(columns=['Airline','Source','Destination'], axis=1, inplace=True)
df
Total_Stops | Price | Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | ... | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 02 | 50 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 07 | 25 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 00 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 05 | 25 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 04 | 45 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10455 | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
10456 | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
10457 | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
10458 | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
10459 | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
10460 rows × 30 columns
sns.boxplot(x=df.Price)
<AxesSubplot:xlabel='Price'>
# Finding outliers in the price column using Z-score
price_arr = np.array(df.Price)
outliers_index = []
def detect_outliers(data):
threshold = 3 #3 standard deviation
mean = np.mean(data)
std = np.std(data)
for i,value in enumerate(data):
z_score = (value-mean)/std
if np.abs(z_score)>threshold:
outliers_index.append(i)
return outliers_index
outliers_index = detect_outliers(price_arr)
print(outliers_index)
[123, 396, 486, 510, 597, 628, 657, 784, 825, 935, 945, 958, 974, 1194, 1244, 1339, 1420, 1462, 1474, 1625, 1650, 1778, 1909, 2044, 2087, 2096, 2483, 2543, 2604, 2621, 2677, 2904, 3010, 3088, 3231, 3372, 3505, 3667, 3974, 4476, 4627, 4779, 4960, 5080, 5312, 5378, 5597, 5636, 5645, 5654, 5673, 5680, 5789, 5916, 6240, 6332, 6497, 6509, 6526, 6901, 7253, 7258, 7279, 7437, 7454, 7515, 7608, 7619, 7649, 7794, 7909, 7967, 8325, 8344, 8408, 8466, 8715, 8796, 8813, 8844, 8873, 9038, 9081, 9455, 9536, 9785, 9862, 9920, 9965, 9987, 9992, 10146, 10157, 10176, 10229, 10299]
df.iloc[123].Price
27430
# Dropping all the outliers
df.drop(outliers_index,axis=0,inplace=True)
# Previous outlier data on the column has been deleted!
df.iloc[123].Price
3540
sns.boxplot(x=df.Price)
<AxesSubplot:xlabel='Price'>
df.columns
Index(['Total_Stops', 'Price', 'Journey_Day', 'Journey_Month', 'Dep_Hour', 'Dep_Min', 'Arrival_Hour', 'Arrival_Min', 'Duration_hours', 'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers', 'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet', 'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata', 'Destination_New Delhi'], dtype='object')
# Rearranging the columns
X = df.loc[:,['Journey_Day', 'Journey_Month', 'Dep_Hour',
'Dep_Min', 'Arrival_Hour', 'Arrival_Min', 'Duration_hours',
'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
'Airline_Jet Airways', 'Airline_Jet Airways Business',
'Airline_Multiple carriers',
'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
'Destination_Kolkata', 'Destination_New Delhi','Total_Stops']]
X.tail()
Journey_Day | Journey_Month | Dep_Hour | Dep_Min | Arrival_Hour | Arrival_Min | Duration_hours | Duration_mins | Airline_Air India | Airline_GoAir | ... | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | Total_Stops | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10455 | 9 | 4 | 19 | 55 | 22 | 25 | 02 | 30 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10456 | 27 | 4 | 20 | 45 | 23 | 20 | 02 | 35 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10457 | 27 | 4 | 8 | 20 | 11 | 20 | 03 | 00 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
10458 | 1 | 3 | 11 | 30 | 14 | 10 | 02 | 40 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
10459 | 9 | 5 | 10 | 55 | 19 | 15 | 08 | 20 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 |
5 rows × 29 columns
X_prime = df.loc[:,['Journey_Day', 'Journey_Month', 'Dep_Hour',
'Dep_Min', 'Arrival_Hour', 'Arrival_Min', 'Duration_hours',
'Duration_mins','Total_Stops']]
# Checking correlation
plt.figure(figsize=(10,10))
sns.heatmap(X_prime.corr(),annot=True)
plt.show()
# Highly correlated values can be dropped from here.