In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')  # on matplotlib >= 3.6 this style is named 'seaborn-v0_8-whitegrid'
from IPython.display import display, HTML, display_html
import seaborn as sns
import datetime
# Machine Learning Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
# Train-test split
from sklearn.model_selection import train_test_split
# Hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Metrics
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
# Additionals
from pprint import pprint
from math import sqrt
DataSet¶
In [2]:
df = pd.read_csv(r'C:\Users\Ankit Bansal\Downloads\final_data.csv')
df.head()
Out[2]:
In [3]:
df.shape
Out[3]:
Data Scaling¶
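MinMaxScaler rescales each feature independently to the $[0, 1]$ range: $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$, where $x_{\min}$ and $x_{\max}$ are the column minimum and maximum. This puts all variables on a common scale for the box plot below.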
In [4]:
dfnew = df.drop(['DATEPRD', 'NPD_WELL_BORE_CODE'], axis=1)
scaler_all_data = MinMaxScaler()
data_scaled = scaler_all_data.fit_transform(dfnew)
# Keep the original column names so the box plot below is labelled
df_data_scaled = pd.DataFrame(data_scaled, columns=dfnew.columns)
df_data_scaled.head(5)
Out[4]:
In [5]:
color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', 'medians': 'DarkBlue', 'caps': 'Gray'}
bp_scaled_all_data = df_data_scaled.plot.box(color=color, sym='r+', figsize=(30, 10))
Oil & Gas Production Rate Prediction¶
In [6]:
# Keep the six predictors plus the two production targets
dfnew = df.filter(['ON_STREAM_HRS', 'AVG_DOWNHOLE_PRESSURE', 'AVG_DP_TUBING', 'AVG_WHP_P', 'AVG_WHT_P', 'DP_CHOKE_SIZE', 'BORE_OIL_VOL', 'BORE_GAS_VOL'], axis=1)
dfnew.head(5)
Out[6]:
In [7]:
# DATEPRD and NPD_WELL_BORE_CODE were already dropped from dfnew, so only the six predictors remain
X = dfnew.filter(['ON_STREAM_HRS', 'AVG_DOWNHOLE_PRESSURE', 'AVG_DP_TUBING', 'AVG_WHP_P', 'AVG_WHT_P', 'DP_CHOKE_SIZE'])
Y = dfnew[['BORE_OIL_VOL', 'BORE_GAS_VOL']]
print('Features shape:', X.shape)
print('Target shape:', Y.shape)
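Note that Y holds two targets (BORE_OIL_VOL and BORE_GAS_VOL); scikit-learn's RandomForestRegressor supports multi-output regression natively, so a single model predicts both rates at once.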
Selection of data set¶
In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)
# Split the date column with the same random_state so it stays aligned with the X and Y splits
Time = df.iloc[:, 0]
Time_train, Time_test = train_test_split(Time, test_size=0.3, random_state=42)
In [9]:
# Fit a baseline random forest with default hyperparameters
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)  # random_state seeds the random number generator
predictions_rf = rf.predict(X_test)
# Metrics
print('Model score:', round(rf.score(X_test, y_test),2))
print('Mean absolute error:', round(mean_absolute_error(y_test, predictions_rf),2))
print('Root mean squared error:', round(sqrt(mean_squared_error(y_test, predictions_rf)),2))
print('R2:', round(r2_score(y_test, predictions_rf),2))
Features importances¶
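Random forest importances are computed as the mean decrease in impurity (Gini importance): the total reduction in the split criterion (variance, for regression) attributable to each feature, averaged over all trees and normalized to sum to 1.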
In [10]:
# List of features
feature_list = list(X_train.columns)
# Get numerical feature importances (Gini importance)
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
# Print out the features and importances
for feature, importance in feature_importances:
    print('Variable: {:30} Importance: {}'.format(feature, importance))
In [11]:
# List of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, color='b', edgecolor='k', linewidth=1.2)
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
In [12]:
# List of features sorted from most to least important
sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# Cumulative importances
cumulative_importances = np.cumsum(sorted_importances)
# Make a line graph
plt.plot(x_values, cumulative_importances, 'b-')
# Draw a line at 95% of importance retained
plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')
# Format x ticks and labels
plt.xticks(x_values, sorted_features, rotation='vertical')
# Axis labels and title
plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances');
In [13]:
# Find number of features for cumulative importance of 95%
# Add 1 because Python is zero-indexed
print('Number of features for 95% importance:', np.where(cumulative_importances > 0.95)[0][0] + 1)
In [14]:
# Extract the names of the five most important features
important_feature_names = [feature[0] for feature in feature_importances[0:5]]
# Find the columns of the most important features
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# Create training and testing sets with only the important features
important_train_features = X_train.iloc[:, important_indices]
important_val_features = X_test.iloc[:, important_indices]
# Sanity check on shapes
print('Important train features shape:', important_train_features.shape)
print('Important val features shape:', important_val_features.shape)
Training and testing in features selected¶
Next, we evaluate performance on the test set using only the selected important features.
In [15]:
# Retrain the random forest on only the important features
rf.fit(important_train_features, y_train);
In [16]:
# Make predictions on validation data
predictions_import = rf.predict(important_val_features)
# Metrics
print('Mean absolute error:', round(mean_absolute_error(y_test, predictions_import),2))
print('Root mean squared error:', round(sqrt(mean_squared_error(y_test, predictions_import)),2))
print('R2:', round(r2_score(y_test, predictions_import),2))
print('Explained variance score:', explained_variance_score(y_test, predictions_import))
In [17]:
# Import the time library for run-time evaluation
import time
# All-features training and testing time
all_features_time = []
# Run 10 iterations and take the average for all features
for _ in range(10):
    start_time = time.time()
    rf.fit(X_train, y_train)
    all_features_predictions = rf.predict(X_test)
    end_time = time.time()
    all_features_time.append(end_time - start_time)
all_features_time = np.mean(all_features_time)
print('All features total training and testing time:', round(all_features_time, 2), 'seconds.')
In [18]:
# Time training and testing for the reduced feature set
reduced_features_time = []
# Run 10 iterations and take the average for reduced features
for _ in range(10):
    start_time = time.time()
    rf.fit(important_train_features, y_train)
    reduced_features_predictions = rf.predict(important_val_features)
    end_time = time.time()
    reduced_features_time.append(end_time - start_time)
reduced_features_time = np.mean(reduced_features_time)
print('Reduced features total training and testing time:', round(reduced_features_time, 2), 'seconds.')
In [19]:
# Mean absolute error
all_mean_absolute_error = mean_absolute_error(y_test, predictions_rf)
reduced_mean_absolute_error = mean_absolute_error(y_test, predictions_import)
# Root mean squared error
all_root_mean_squared_error = sqrt(mean_squared_error(y_test, predictions_rf))
reduced_root_mean_squared_error = sqrt(mean_squared_error(y_test, predictions_import))
# R2 score
all_r2_score = r2_score(y_test, predictions_rf)
reduced_r2_score = r2_score(y_test, predictions_import)
# Summarize the results in a DataFrame
comparison = pd.DataFrame({'Features': ['all (6)', 'reduced (5)'],
                           'R2': [round(all_r2_score, 2), round(reduced_r2_score, 2)],
                           'Mean absolute error': [round(all_mean_absolute_error, 2), round(reduced_mean_absolute_error, 2)],
                           'Root mean squared error': [round(all_root_mean_squared_error, 2), round(reduced_root_mean_squared_error, 2)],
                           'Run time': [round(all_features_time, 2), round(reduced_features_time, 2)]})
comparison[['Features', 'R2', 'Mean absolute error', 'Root mean squared error', 'Run time']]
Out[19]:
Examine the default Random Forest to determine parameters¶
In [20]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())
Defining hyperparameters¶
In [21]:
# Definition of the search space for the random forest
# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=2, stop=2000, num=20)]
# Number of features to consider at every split
# ('auto' was removed in scikit-learn 1.3; on recent versions use 1.0 instead)
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree (here just 4, 30 and None)
max_depth = [int(x) for x in np.linspace(4, 30, num=2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
Random search with cross validation¶
In k-fold cross validation, the training set is split into $k$ smaller sets (folds). The procedure works as follows (see the sketch after this list):
- A model is trained using $k-1$ of the folds as training data.
- The resulting model is validated on the remaining fold, so the candidate hyperparameters are tested on data the model has not seen.
- The performance measure reported by k-fold cross validation is the average of the values computed in the loop.
- The method is computationally expensive.
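A minimal sketch of this procedure, using the cross_val_score helper already imported above and the X_train / y_train split defined earlier (the 50-tree forest is just an illustrative choice):
# 3-fold cross validation of a small random forest on the training set;
# scikit-learn maximizes scores, so MAE comes back as a negative number
cv_scores = cross_val_score(RandomForestRegressor(n_estimators=50, random_state=42),
                            X_train, y_train,
                            scoring='neg_mean_absolute_error', cv=3)
print('MAE per fold:', -cv_scores)
print('Mean MAE across folds:', -cv_scores.mean())
RandomizedSearchCV below runs exactly this loop for each sampled hyperparameter combination and keeps the combination with the best average score.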
In [22]:
rf = RandomForestRegressor(random_state = 42)
# Use the random grid to search for best hyperparameters
# Random search of parameters, using 3 fold cross validation,
# search across different combinations.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=15, scoring='neg_mean_absolute_error', cv=3, verbose=2, random_state=42, n_jobs=-1, return_train_score=True)
# Fit the random search model
rf_random.fit(X_train, y_train);
Definition of best hyperparameters¶
In [23]:
# Obtaining the best parameters
rf_random.best_params_
Out[23]:
Evaluation of parameters in test set¶
In [24]:
# best_estimator_ was already refit on the full training set by RandomizedSearchCV (refit=True by default)
best_random = rf_random.best_estimator_
predictions_best_random_test = best_random.predict(X_test)
predictions_best_random_train = best_random.predict(X_train)
print('Model score:', round(best_random.score(X_test, y_test),2))
print('Mean absolute error:', round(mean_absolute_error(y_test, predictions_best_random_test),2))
print('Root mean squared error:', round(sqrt(mean_squared_error(y_test, predictions_best_random_test)),2))
print('R2:', round(r2_score(y_test, predictions_best_random_test),2))
r2_rf=r2_score(y_test, predictions_best_random_test)
Mean_absolute_error_rf=mean_absolute_error(y_test, predictions_best_random_test)
Root_mean_squared_error_rf=sqrt(mean_squared_error(y_test, predictions_best_random_test))
Visualization of prediction with best hyperparameters¶
In [26]:
# Build a tidy comparison of actual vs. predicted oil volumes
# (column 0 of the prediction array corresponds to BORE_OIL_VOL, the first target)
result_df = pd.DataFrame({'BORE_OIL_VOL': y_test['BORE_OIL_VOL'].values,
                          'Predicted': predictions_best_random_test[:, 0]},
                         index=y_test.index)
result_df
Out[26]:
In [27]:
def abline(slope, intercept):
    """Plot a reference line with the given slope and intercept over the current axes."""
    axes = plt.gca()
    x_vals = np.array(axes.get_xlim())
    y_vals = intercept + slope * x_vals
    plt.plot(x_vals, y_vals, '-')

# Predicted vs. actual on the training data
y_np = np.array(y_train)
y_hat = predictions_best_random_train
plt.plot(y_hat, y_np, 'o')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Predicted vs. Actual: Visual Linearity Test')
# Reference line y = x: points on it correspond to perfect predictions
abline(1, 0)
plt.show()