In [1]:
# import required packages

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

## Preporcessing

In [2]:
# read the dataset

df = pd.read_excel("input_raw_data.xlsx")
df

Unnamed: 0.1,Unnamed: 0,Channel,Week Day,TimeBand,Share,AMA,rate,daily reach,cume reach,ATS,Unrolled
0,7'23,Aaj Tak,Saturday,02:00:00 - 02:30:00,0.081305,0.123363,0.000433,3.70,3.700893,00:01:00,0.000000
1,7'23,Aaj Tak,Saturday,02:30:00 - 03:00:00,0.469995,0.394070,0.001383,11.82,11.822103,00:01:00,0.000000
2,7'23,Aaj Tak,Saturday,03:00:00 - 03:30:00,1.723084,0.361537,0.001269,10.85,10.846120,00:01:00,0.000000
3,7'23,Aaj Tak,Saturday,03:30:00 - 04:00:00,2.019206,0.251790,0.000884,7.55,7.553692,00:01:00,0.000000
4,7'23,Aaj Tak,Saturday,04:00:00 - 04:30:00,1.163916,0.333603,0.001171,10.01,10.008100,00:01:00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
12091,15'23,Aaj Tak,Friday,23:30:00 - 24:00:00,0.315975,6.315608,0.028382,52.33,52.334241,00:03:37,1.870176
12092,15'23,Aaj Tak,Friday,24:00:00 - 24:30:00,0.690376,8.010992,0.036001,33.65,33.651447,00:07:09,6.204409
12093,15'23,Aaj Tak,Friday,24:30:00 - 25:00:00,1.313761,8.575085,0.038536,26.97,26.974041,00:09:32,6.526442
12094,15'23,Aaj Tak,Friday,25:00:00 - 25:30:00,1.141046,4.483507,0.020149,37.21,37.214790,00:03:37,5.011646


In [3]:
df.rename(columns={'Unnamed: 0':'Week number'}, inplace=True)

In [4]:
df.head()

Unnamed: 0,Week number,Channel,Week Day,TimeBand,Share,AMA,rate,daily reach,cume reach,ATS,Unrolled
0,7'23,Aaj Tak,Saturday,02:00:00 - 02:30:00,0.081305,0.123363,0.000433,3.7,3.700893,00:01:00,0.0
1,7'23,Aaj Tak,Saturday,02:30:00 - 03:00:00,0.469995,0.39407,0.001383,11.82,11.822103,00:01:00,0.0
2,7'23,Aaj Tak,Saturday,03:00:00 - 03:30:00,1.723084,0.361537,0.001269,10.85,10.84612,00:01:00,0.0
3,7'23,Aaj Tak,Saturday,03:30:00 - 04:00:00,2.019206,0.25179,0.000884,7.55,7.553692,00:01:00,0.0
4,7'23,Aaj Tak,Saturday,04:00:00 - 04:30:00,1.163916,0.333603,0.001171,10.01,10.0081,00:01:00,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12096 entries, 0 to 12095
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Week number  12096 non-null  object 
 1   Channel      12096 non-null  object 
 2   Week Day     12096 non-null  object 
 3   TimeBand     12096 non-null  object 
 4   Share        12096 non-null  float64
 5   AMA          12096 non-null  float64
 6   rate         12096 non-null  float64
 7   daily reach  12096 non-null  float64
 8   cume reach   12096 non-null  float64
 9   ATS          12096 non-null  object 
 10  Unrolled     12096 non-null  float64
dtypes: float64(6), object(5)
memory usage: 1.0+ MB


In [6]:
df.describe()

Unnamed: 0,Share,AMA,rate,daily reach,cume reach,Unrolled
count,12096.0,12096.0,12096.0,12096.0,12096.0,12096.0
mean,0.904877,3.638381,0.031671,30.726294,30.726317,3.487959
std,3.77326,4.987969,0.074512,33.505783,33.505793,5.746293
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.089353,0.122776,0.003831,3.0,3.002531,0.0
50%,0.199747,2.192741,0.015068,22.73,22.732177,0.974788
75%,0.482635,5.174398,0.02907,46.93,46.932208,4.620285
max,100.0,42.072407,1.356598,229.33,229.334577,60.765814


In [7]:
# Count values of Week number
df['Week number'].value_counts()   # we have records of from 7 to 15

Week number
7'23     1344
8'23     1344
9'23     1344
10'23    1344
11'23    1344
12'23    1344
13'23    1344
14'23    1344
15'23    1344
Name: count, dtype: int64

In [8]:
# Count values of Channel
df['Channel'].value_counts()

Channel
Aaj Tak    12096
Name: count, dtype: int64

In [9]:
# Count values of Week Day
df['Week Day'].value_counts() # from Sunday to Monday

Week Day
Saturday     1728
Sunday       1728
Monday       1728
Tuesday      1728
Wednesday    1728
Thursday     1728
Friday       1728
Name: count, dtype: int64

In [10]:
# count values of TimeBand
df['TimeBand'].value_counts()

TimeBand
02:00:00 - 02:30:00    252
02:30:00 - 03:00:00    252
15:00:00 - 15:30:00    252
15:30:00 - 16:00:00    252
16:00:00 - 16:30:00    252
16:30:00 - 17:00:00    252
17:00:00 - 17:30:00    252
17:30:00 - 18:00:00    252
18:00:00 - 18:30:00    252
18:30:00 - 19:00:00    252
19:00:00 - 19:30:00    252
19:30:00 - 20:00:00    252
20:00:00 - 20:30:00    252
20:30:00 - 21:00:00    252
21:00:00 - 21:30:00    252
21:30:00 - 22:00:00    252
22:00:00 - 22:30:00    252
22:30:00 - 23:00:00    252
23:00:00 - 23:30:00    252
23:30:00 - 24:00:00    252
24:00:00 - 24:30:00    252
24:30:00 - 25:00:00    252
25:00:00 - 25:30:00    252
14:30:00 - 15:00:00    252
14:00:00 - 14:30:00    252
13:30:00 - 14:00:00    252
07:30:00 - 08:00:00    252
03:00:00 - 03:30:00    252
03:30:00 - 04:00:00    252
04:00:00 - 04:30:00    252
04:30:00 - 05:00:00    252
05:00:00 - 05:30:00    252
05:30:00 - 06:00:00    252
06:00:00 - 06:30:00    252
06:30:00 - 07:00:00    252
07:00:00 - 07:30:00    252
08:00:00 - 08:30:00

## Label Encoding

In [11]:
df.columns

Index(['Week number', 'Channel', 'Week Day', 'TimeBand', 'Share', 'AMA',
       'rate', 'daily reach', 'cume reach', 'ATS', 'Unrolled'],
      dtype='object')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12096 entries, 0 to 12095
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Week number  12096 non-null  object 
 1   Channel      12096 non-null  object 
 2   Week Day     12096 non-null  object 
 3   TimeBand     12096 non-null  object 
 4   Share        12096 non-null  float64
 5   AMA          12096 non-null  float64
 6   rate         12096 non-null  float64
 7   daily reach  12096 non-null  float64
 8   cume reach   12096 non-null  float64
 9   ATS          12096 non-null  object 
 10  Unrolled     12096 non-null  float64
dtypes: float64(6), object(5)
memory usage: 1.0+ MB


In [13]:
# Need to Label Encode columns like: 
# As of now Channel is not needed to encode as we are checking with AajTak only
# 1: Week Day
# 2: TimeBand

In [14]:
# 1: Week Day

weekDay_le = LabelEncoder()
df['Week_Day_Encoded'] = weekDay_le.fit_transform(df['Week Day'])

In [15]:
# L1 = list(weekDay_le.inverse_transform(df['Week_Day_Encoded']))
# d1 = dict(zip(weekDay_le.classes_, weekDay_le.transform(weekDay_le.classes_)))
# print (d1)

# # Output: {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}

In [16]:
# 2: TimeBand

timeBand_le = LabelEncoder()
df['Time_Band_Encoded'] = timeBand_le.fit_transform(df['TimeBand'])

In [17]:
# L2 = list(timeBand_le.inverse_transform(df['Time_Band_Encoded']))
# d2 = dict(zip(timeBand_le.classes_, timeBand_le.transform(timeBand_le.classes_)))
# print(d2)

# # # Output: {'02:00:00 - 02:30:00': 0, '02:30:00 - 03:00:00': 1, '03:00:00 - 03:30:00': 2, '03:30:00 - 04:00:00': 3, 
#              '04:00:00 - 04:30:00': 4, '04:30:00 - 05:00:00': 5, '05:00:00 - 05:30:00': 6, '05:30:00 - 06:00:00': 7, 
#              '06:00:00 - 06:30:00': 8, '06:30:00 - 07:00:00': 9, '07:00:00 - 07:30:00': 10, '07:30:00 - 08:00:00': 11, 
#              '08:00:00 - 08:30:00': 12, '08:30:00 - 09:00:00': 13, '09:00:00 - 09:30:00': 14, '09:30:00 - 10:00:00': 15, 
#              '10:00:00 - 10:30:00': 16, '10:30:00 - 11:00:00': 17, '11:00:00 - 11:30:00': 18, '11:30:00 - 12:00:00': 19, 
#              '12:00:00 - 12:30:00': 20, '12:30:00 - 13:00:00': 21, '13:00:00 - 13:30:00': 22, '13:30:00 - 14:00:00': 23, 
#              '14:00:00 - 14:30:00': 24, '14:30:00 - 15:00:00': 25, '15:00:00 - 15:30:00': 26, '15:30:00 - 16:00:00': 27, 
#              '16:00:00 - 16:30:00': 28, '16:30:00 - 17:00:00': 29, '17:00:00 - 17:30:00': 30, '17:30:00 - 18:00:00': 31, 
#              '18:00:00 - 18:30:00': 32, '18:30:00 - 19:00:00': 33, '19:00:00 - 19:30:00': 34, '19:30:00 - 20:00:00': 35, 
#              '20:00:00 - 20:30:00': 36, '20:30:00 - 21:00:00': 37, '21:00:00 - 21:30:00': 38, '21:30:00 - 22:00:00': 39, 
#              '22:00:00 - 22:30:00': 40, '22:30:00 - 23:00:00': 41, '23:00:00 - 23:30:00': 42, '23:30:00 - 24:00:00': 43, 
#              '24:00:00 - 24:30:00': 44, '24:30:00 - 25:00:00': 45, '25:00:00 - 25:30:00': 46, '25:30:00 - 26:00:00': 47}

In [18]:
df.head()

Unnamed: 0,Week number,Channel,Week Day,TimeBand,Share,AMA,rate,daily reach,cume reach,ATS,Unrolled,Week_Day_Encoded,Time_Band_Encoded
0,7'23,Aaj Tak,Saturday,02:00:00 - 02:30:00,0.081305,0.123363,0.000433,3.7,3.700893,00:01:00,0.0,2,0
1,7'23,Aaj Tak,Saturday,02:30:00 - 03:00:00,0.469995,0.39407,0.001383,11.82,11.822103,00:01:00,0.0,2,1
2,7'23,Aaj Tak,Saturday,03:00:00 - 03:30:00,1.723084,0.361537,0.001269,10.85,10.84612,00:01:00,0.0,2,2
3,7'23,Aaj Tak,Saturday,03:30:00 - 04:00:00,2.019206,0.25179,0.000884,7.55,7.553692,00:01:00,0.0,2,3
4,7'23,Aaj Tak,Saturday,04:00:00 - 04:30:00,1.163916,0.333603,0.001171,10.01,10.0081,00:01:00,0.0,2,4


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12096 entries, 0 to 12095
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Week number        12096 non-null  object 
 1   Channel            12096 non-null  object 
 2   Week Day           12096 non-null  object 
 3   TimeBand           12096 non-null  object 
 4   Share              12096 non-null  float64
 5   AMA                12096 non-null  float64
 6   rate               12096 non-null  float64
 7   daily reach        12096 non-null  float64
 8   cume reach         12096 non-null  float64
 9   ATS                12096 non-null  object 
 10  Unrolled           12096 non-null  float64
 11  Week_Day_Encoded   12096 non-null  int32  
 12  Time_Band_Encoded  12096 non-null  int32  
dtypes: float64(6), int32(2), object(5)
memory usage: 1.1+ MB


## Model Development : RandomForestRegressor

In [20]:
# Splitting into X and y 

X = df[['Share', 'AMA', 'rate','daily reach', 'cume reach','Week_Day_Encoded','Time_Band_Encoded']]
y = df[['Unrolled']]

In [33]:
# Splitting into training and testing datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [34]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9676, 7), (2420, 7), (9676, 1), (2420, 1))

In [35]:
X_train.head()

Unnamed: 0,Share,AMA,rate,daily reach,cume reach,Week_Day_Encoded,Time_Band_Encoded
11232,0.043364,0.080953,0.000357,2.43,2.428586,5,0
11118,0.31928,7.050287,0.031111,45.37,45.372124,2,30
9301,0.090855,5.284389,0.023781,60.32,60.31794,6,37
3222,0.402614,0.207835,0.000917,4.82,4.815343,6,6
10322,12.873856,0.064336,0.01522,1.93,1.930081,4,2


In [36]:
y_train[:5]

Unnamed: 0,Unrolled
11232,0.0
11118,0.0
9301,6.285889
3222,0.47324
10322,0.0


In [37]:
X_test.head()

Unnamed: 0,Share,AMA,rate,daily reach,cume reach,Week_Day_Encoded,Time_Band_Encoded
468,0.152596,9.820626,0.043337,94.61,94.614234,1,36
11620,0.0,0.0,0.0,0.0,0.0,6,4
538,0.969294,3.181874,0.014043,34.3,34.298911,6,10
5265,0.064741,2.991051,0.013427,41.62,41.619074,6,33
7484,0.0,0.0,0.0,0.0,0.0,3,44


In [38]:
y_test[:5]

Unnamed: 0,Unrolled
468,12.150886
11620,0.0
538,1.480424
5265,5.781056
7484,0.0


In [39]:
# Train Random Forest Regression model

model = RandomForestRegressor(random_state = 42)
model.fit(X_train, y_train)

In [40]:
# Make predictions on train data

y_pred_train = model.predict(X_train)

In [72]:
acc_train = r2_score(y_train, y_pred_train)
print("The Accuracy of Training Dataset is : ",acc_train*100)

The Accuracy of Training Dataset is :  95.65798927048185


In [42]:
# Make predictions on test data

y_pred_test = model.predict(X_test)

In [71]:
acc_test = r2_score(y_test, y_pred_test)
print("The Accuracy of Test Dataset is : ",acc_test*100)

The Accuracy of Test Dataset is :  71.01332045918515


In [70]:
# # Saving Model

# import pickle

# with open('aajTak_model.pkl','wb') as file1:
#     pickle.dump(model,file1)   

## Hyperparameter Tuning for Random Forest Regression

In [45]:
# Hyperparameter Tuning

hyp_model = RandomForestRegressor()

hyp = {
"n_estimators": np.arange(10,50,10),
'criterion':["squared_error", "absolute_error"],
'max_depth':np.arange(3,50),
# 'min_samples_split':np.arange(2,5),
# 'min_samples_leaf':np.arange(1,5),
'random_state':np.arange(0,100)
}

In [46]:
rscv = RandomizedSearchCV(hyp_model, hyp, cv=5)
rscv.fit(X_train,y_train)

In [47]:
rscv.best_params_

{'random_state': 49,
 'n_estimators': 20,
 'max_depth': 39,
 'criterion': 'absolute_error'}

In [48]:
best_model = rscv.best_estimator_

In [49]:
best_model.fit(X_train, y_train)

In [50]:
ypredtn = best_model.predict(X_train)

In [51]:
acctn = r2_score(y_train, ypredtn)
print("The Accuracy of Training Dataset after hyperparameter tuning is : ",acctn*100)

The Accuracy of Training Dataset after hyperparameter tuning is :  94.41670975802535


In [52]:
ypredts = best_model.predict(X_test)

In [54]:
accts = r2_score(y_test, ypredts)
print("The Accuracy of Testing Dataset after hyperparameter tuning is : ",accts*100)

The Accuracy of Testing Dataset after hyperparameter tuning is :  69.97941529616791


In [73]:
# # Saving Model

# import pickle

# with open('aajTak_fineTune_model.pkl','wb') as file:
#     pickle.dump(best_model,file)   

In [74]:
# # Saving the LabelEncoders for weekDay

# with open('weekDay_le.pkl','wb') as f1:
#     pickle.dump(weekDay_le,f1)

In [75]:
# # Saving the LabelEncoders for timeBand

# with open('timeBand_le.pkl','wb') as f2:
#     pickle.dump(timeBand_le,f2)

## UserTest Function - Prediction Script

In [1]:
# import required packages

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

import pickle

In [2]:
# load the saved model using pickle
with open('aajTak_model.pkl', 'rb') as f1:
    model1 = pickle.load(f1)

In [3]:
# # load the saved model using pickle
# with open('aajTak_fineTune_model.pkl', 'rb') as file:
#     model = pickle.load(file)

# Load the saved weekDay label encoder object using pickle
with open('weekDay_le.pkl','rb') as file1:
    weekDay_le = pickle.load(file1)

# Load the saved timeBand label encoder object using pickle
with open('timeBand_le.pkl','rb') as file2:
    timeBand_le = pickle.load(file2)

In [4]:
# define the prediction function
# X = df[['Share', 'AMA', 'rate','daily reach', 'cume reach','Week_Day_Encoded','Time_Band_Encoded']]
# y = df[['Unrolled']]


def predict_unrolled_value(Share, AMA, rate, daily_reach, cume_reach, Week_Day, Time_Band):
    
    # create a DataFrame with the input variables
    
    # encode the Week_Day using the loaded LabelEncoder object
    weekDay_encoded = weekDay_le.transform([Week_Day])[0]
    
    # encode the Time_Band using the loaded LabelEncoder object
    Time_Band_encoded = timeBand_le.transform([Time_Band])[0]
    
    input_data = pd.DataFrame({'Share': [Share], 
                               'AMA': [AMA], 
                               'rate': [rate],
                               'daily reach': [daily_reach], 
                               'cume reach': [cume_reach], 
                               'Week_Day_Encoded': [weekDay_encoded], 
                               'Time_Band_Encoded': [Time_Band_encoded]})
    
    # make the prediction using the loaded model and input data
    predicted_unrolled_value = model1.predict(input_data)
    
    # return the predicted unrolled value as output
    return predicted_unrolled_value[0]

In [5]:
# Function calling
# 0.064741	2.991051	0.013427	41.62	41.619074	'Wednesday'	'18:30:00 - 19:00:00' --> test input data
# 5.781056 --> unrolled actual value

predict_unrolled_value(0.064741, 2.991051, 0.013427, 41.62, 41.619074, 'Wednesday', '18:30:00 - 19:00:00')

4.123954

In [6]:
# 0.152596	9.820626	0.043337	94.61	94.614234	1	'20:00:00 - 20:30:00'
# 12.150886
predict_unrolled_value(0.152596, 9.820626, 0.043337, 94.61, 94.614234, 'Monday', '20:00:00 - 20:30:00')

9.738856000000002

In [7]:
# 0.611246	4.196084	0.018516	36.23	36.231006	'Saturday'	''08:00:00 - 08:30:00''
# 3.711884
predict_unrolled_value(0.611246, 4.196084, 0.018516, 36.23, 36.23, 'Saturday', '08:00:00 - 08:30:00')

3.3215619