# Importing necessary libraries
import numpy as np
import pandas as pd

from google.colab import files
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import os
from xgboost import XGBClassifier

# This loop walks through the directory "/kaggle/input" and prints the full path of each file found
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))  # Join and print the full file path

# Read the data from csv and load the data into a Pandas DataFrame

#uploaded = files.upload()

#data = pd.read_csv(next(iter(uploaded.keys())))
data = pd.read_csv("/content/sample_data/Significant Earthquake Dataset 1900-2023.csv", sep=',')
data.head()

print("Name of each column")
data.columns

Name of each column

Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType',
       'nst', 'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Unnamed: 14',
       'Type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

print("Data types of each column")
data.dtypes

Data types of each column

m = folium.Map(location=[0, 0], zoom_start=2)
for _, row in data.iterrows():
    folium.CircleMarker(location=[row["Latitude"], row["Longitude"]],
                        radius=2, color='blue', fill=True, fill_color='blue', fill_opacity=0.5).add_to(m)
m.save("earthquake_map.html")
m

plt.figure(figsize=(8, 6))
sns.histplot(data['Mag'], kde=True, bins=30, color='blue')
plt.title('Earthquake Magnitude Distribution')
plt.xlabel('Magnitude')
plt.ylabel('Frequency')
plt.show()

plt.figure(figsize=(8, 6))
sns.histplot(data['Depth'], kde=True, bins=30, color='green')
plt.title('Earthquake Depths Distribution')
plt.xlabel('Depth (km)')
plt.ylabel('Frequency')
plt.show()

# Magnitude vs Depth
plt.scatter(data['Depth'], data['Mag'], alpha=0.5, c=data['Mag'], cmap='viridis')
plt.title('Magnitude vs Depth of Earthquakes')
plt.xlabel('Depth (km)')
plt.ylabel('Magnitude')
plt.colorbar(label='Magnitude')
plt.show()

data['Year'] = pd.to_datetime(data['Time']).dt.year
yearly_counts = data['Year'].value_counts().sort_index()

plt.figure(figsize=(12, 6))
plt.plot(yearly_counts.index, yearly_counts.values, marker='o', color='blue')
plt.title('Earthquake Frequency Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Earthquakes')
plt.grid(True)
plt.show()

top_places = data['Place'].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_places.values, y=top_places.index, palette='magma')
plt.title('Top 10 Locations with Most Earthquakes')
plt.xlabel('Number of Earthquakes')
plt.ylabel('Place')
plt.show()

<ipython-input-11-ebf1db413615>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_places.values, y=top_places.index, palette='magma')

plt.figure(figsize=(12, 8))
plt.scatter(data['Longitude'], data['Latitude'], s=data['Mag']**2, c=data['Depth'], cmap='cool', alpha=0.6)
plt.colorbar(label='Depth (km)')
plt.title('Bubble Plot of Earthquakes (Size = Magnitude)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(pd.to_datetime(data['Time']), data['Mag'], '.', alpha=0.5, markersize=2)
plt.title('Earthquake Magnitudes Over Time')
plt.xlabel('Time')
plt.ylabel('Magnitude')
plt.grid(True)
plt.show()

missing_values = data.isnull().sum() # we are calculating the number of missing values per features
missing_percentage = (missing_values / len(data)) * 100 # transforming it into % just to have a better vizualization
print(missing_values)
print(missing_percentage)

Time                   0
Place                284
Latitude               0
Longitude              0
Depth                134
Mag                    0
MagType                0
nst                29858
gap                27244
dmin               32936
rms                17113
net                    0
ID                     0
Updated                0
Unnamed: 14        37331
Type                   0
horizontalError    33361
depthError         16504
magError           20780
magNst             31959
status                 0
locationSource         0
magSource              0
dtype: int64
Time                 0.000000
Place                0.760762
Latitude             0.000000
Longitude            0.000000
Depth                0.358951
Mag                  0.000000
MagType              0.000000
nst                 79.981785
gap                 72.979561
dmin                88.226943
rms                 45.841258
net                  0.000000
ID                   0.000000
Updated              0.000000
Unnamed: 14        100.000000
Type                 0.000000
horizontalError     89.365407
depthError          44.209906
magError            55.664193
magNst              85.609815
status               0.000000
locationSource       0.000000
magSource            0.000000
dtype: float64

# Missing values
threshold = 0.5

data = data.dropna(thresh=int((1-threshold) * len(data)), axis=1) # we are dropping columns with more than 50% missing values
# for numerical data (their type is float64, we have already seen it previously), we are going to attribute the value equals to the median
for col in data.select_dtypes(include=['float64', 'int64']).columns: #searching in the numerical features
    if data[col].isnull().sum() > 0: #checking if this column is not null with the >0 condition
        median_value = data[col].median()
        data[col].fillna(median_value, inplace=True)
        print(f"Missing values in numeric column '{col}' filled with median: {median_value}")

# for categorical data like 'Place', we replace missing values with 'Unknown'
data['Place'] = data['Place'].fillna('Unknown')

Missing values in numeric column 'Depth' filled with median: 28.5
Missing values in numeric column 'rms' filled with median: 1.0
Missing values in numeric column 'depthError' filled with median: 6.1

<ipython-input-7-df00a431aef6>:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(median_value, inplace=True)
<ipython-input-7-df00a431aef6>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col].fillna(median_value, inplace=True)
<ipython-input-7-df00a431aef6>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Place'] = data['Place'].fillna('Unknown')

data.describe() #just checking the values

missing_values = data.isnull().sum() # after applying the change, we are checking again if there are any missing values
print(missing_values)

Time              0
Place             0
Latitude          0
Longitude         0
Depth             0
Mag               0
MagType           0
rms               0
net               0
ID                0
Updated           0
Type              0
depthError        0
status            0
locationSource    0
magSource         0
dtype: int64

duplicates = data.duplicated().sum() #now, let's check if there are any duplicate rows, we have already seen the different subjects and so rows so in theory we will find 0
print("number of duplicate rows:", duplicates)

number of duplicate rows: 0

# we are now checking for outliers and we will use the Interquartile Range (IQR)
numeric_data = data.select_dtypes(include=['float64', 'int64']) # we are again working with the numerical data
print("Numeric Data:")
print(numeric_data.head()) # just showing the 5 first rows to have an idea of what we are working on
Q1 = numeric_data.quantile(0.25) # we are calculating the quantiles associated to the first 25% and 75% of data
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1 # IQR is just the difference between the quantile associated to 75% and the one to 25%
outliers = ((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))) # defining the outliers with the condition
print("\nOutliers:\n", outliers)

Numeric Data:
   Latitude  Longitude   Depth  Mag   rms  depthError
0   -6.5986   132.0763  38.615  6.1  0.76       5.595
1  -15.0912   167.0294  36.029  5.6  0.94       6.080
2   12.3238   123.8662  20.088  6.1  0.54       4.399
3  -40.5465   174.5709  74.320  5.7  1.15       4.922
4   45.1126    23.1781  10.000  5.6  0.40       1.794

Outliers:
        Latitude  Longitude  Depth    Mag    rms  depthError
0         False      False  False  False   True       False
1         False      False  False  False   True       False
2         False      False  False  False   True       False
3         False      False  False  False   True       False
4         False      False  False  False   True        True
...         ...        ...    ...    ...    ...         ...
37326     False      False  False   True  False       False
37327     False      False  False   True  False       False
37328     False      False  False   True  False       False
37329     False      False  False  False  False       False
37330     False      False  False   True  False       False

[37331 rows x 6 columns]

# we used boxplots to vizualize our outliers for each numerical feature
plt.figure(figsize=(10, 10))
sns.boxplot(data=numeric_data, orient="h")
plt.title("Boxplots for Numeric Features")
plt.show()

# displaying the summary of the data we have just worked on
print("Statistical Summary (Numeric Data):")
data.describe()

Statistical Summary (Numeric Data):

# same but for categorical columns
print("Statistical Summary (Categorical Data):")
data.describe(include=['object', 'category'])

Statistical Summary (Categorical Data):

data = data.drop(columns=['ID', 'Updated'])
data

class CustomTransformation():

    def __init__(self):
        """
        Simple class to convert categorical variables into numeric indices,
        usable for a machine learning algorithm
        """
        # Here we will replace the property type, room, bed, city, and neighborhood with an index
        self.placefitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.place2index = dict()  # Dictionary that will convert the name to an index
        self.placemax_index = 0  # Indicates the last index of the place.

        self.magtypefitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.magtype2index = dict()  # Dictionary that will convert the name to an index
        self.magtypemax_index = 0  # Indicates the last index of the magtype.

        self.netfitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.net2index = dict()  # Dictionary that will convert the name to an index
        self.netmax_index = 0  # Indicates the last index of the net.

        self.typefitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.type2index = dict()  # Dictionary that will convert the name to an index
        self.typemax_index = 0  # Indicates the last index of the type.

        self.statusfitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.status2index = dict()  # Dictionary that will convert the name to an index
        self.statusmax_index = 0  # Indicates the last index of the status.

        self.locationsourcefitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.locationsource2index = dict()  # Dictionary that will convert the name to an index
        self.locationsourcemax_index = 0  # Indicates the last index of the locationsource.

        self.magsourcefitted = False  # Indicates if fit_transform has been used, to avoid using transform without fit being called
        self.magsource2index = dict()  # Dictionary that will convert the name to an index
        self.magsourcemax_index = 0  # Indicates the last index of the magsource.

    def fit_transform(self, dataset):

        self.placefitted = True
        self.magtypefitted = True
        self.netfitted = True
        self.typefitted = True
        self.statusfitted = True
        self.locationsourcefitted = True
        self.magsourcefitted = True

        # Retrieve the different columns to convert
        places = dataset["Place"].unique()
        print("Every place : ", places)
        self.place2index = {prop:i for (i, prop) in enumerate(places)}
        self.placemax_index = max(list(self.place2index.values()))

        magtypes = dataset["MagType"].unique()
        print("Every magtype : ", magtypes)
        self.magtype2index = {prop:i for (i, prop) in enumerate(magtypes)}
        self.magtypemax_index = max(list(self.magtype2index.values()))

        nets = dataset["net"].unique()
        print("Every net : ", nets)
        self.net2index = {prop:i for (i, prop) in enumerate(nets)}
        self.netmax_index = max(list(self.net2index.values()))

        types = dataset["Type"].unique()
        print("Every type of earthquakes : ", types)
        self.type2index = {prop:i for (i, prop) in enumerate(types)}
        self.typemax_index = max(list(self.type2index.values()))

        status = dataset["status"].unique()
        print("Every status : ", status)
        self.status2index = {prop:i for (i, prop) in enumerate(status)}
        self.statusmax_index = max(list(self.status2index.values()))

        locationsources = dataset["locationSource"].unique()
        print("Every location source : ", locationsources)
        self.locationsource2index = {prop:i for (i, prop) in enumerate(locationsources)}
        self.locationsourcemax_index = max(list(self.locationsource2index.values()))

        magsources = dataset["magSource"].unique()
        print("Every magsource : ", magsources)
        self.magsource2index = {prop:i for (i, prop) in enumerate(magsources)}
        self.magsourcemax_index = max(list(self.magsource2index.values()))

        # Fonction transform
        return self.transform(dataset)

    def transform(self, dataset):

        # Transform into indices
        dataset.loc[:, "Place"] = dataset["Place"].replace(self.place2index)
        dataset.loc[:, "MagType"] = dataset["MagType"].replace(self.magtype2index)
        dataset.loc[:, "net"] = dataset["net"].replace(self.net2index)
        dataset.loc[:, "Type"] = dataset["Type"].replace(self.type2index)
        dataset.loc[:, "status"] = dataset["status"].replace(self.status2index)
        dataset.loc[:, "locationSource"] = dataset["locationSource"].replace(self.locationsource2index)
        dataset.loc[:, "magSource"] = dataset["magSource"].replace(self.magsource2index)

        dataset[dataset.Place.isna()] = self.placemax_index + 1
        dataset[dataset.MagType.isna()] = self.magtypemax_index + 1
        dataset[dataset.net.isna()] = self.netmax_index + 1
        dataset[dataset.Type.isna()] = self.typemax_index + 1
        dataset[dataset.status.isna()] = self.statusmax_index + 1
        dataset[dataset.locationSource.isna()] = self.locationsourcemax_index + 1
        dataset[dataset.magSource.isna()] = self.magsourcemax_index + 1

        return dataset

# Cell that may take some time (~2 mins)
features_transformer = CustomTransformation()
data = features_transformer.fit_transform(data.copy())

Every place :  ['130 km SW of Tual, Indonesia' '7 km SW of Port-Olry, Vanuatu'
 'Masbate region, Philippines' ... '221 km SW of Nikolski, Alaska'
 '12 km NNW of Parkfield, California' '16 km SW of Old Harbor, Alaska']
Every magtype :  ['mww' 'mb' 'Mi' 'mwc' 'mw' 'mwb' 'mwr' 'ml' 'ms_20' 'mwp' 'Ml' 'ms' 'md'
 'mh' 'uk' 'fa' 'lg' 'mint']
Every net :  ['us' 'pt' 'nc' 'pr' 'ak' 'hv' 'ci' 'nn' 'uu' 'official' 'iscgem' 'se'
 'gcmt' 'uw' 'iscgemsup' 'ushis']
Every type of earthquakes :  ['earthquake' 'volcanic eruption' 'nuclear explosion' 'explosion']
Every status :  ['reviewed' 'automatic']
Every location source :  ['us' 'pt' 'nc' 'pr' 'ak' 'hv' 'ci' 'nn' 'uu' 'pgc' 'tul' 'ath' 'guc'
 'unm' 'the' 'teh' 'us_wel' 'wel' 'aeic' 'thr' 'lim' 'gcmt' 'tap' 'rom'
 'isk' 'iscgem' 'se' 'ucr' 'beo' 'rspr' 'mdd' 'sja' 'car' 'jma' 'csem'
 'casc' 'uw' 'ren' 'spe' 'doe' 'ott' 'ags' 'bou' 'brk' 'a' 'b' 'e' 'ag'
 'g' 'u' 'h' 'iscgemsup' 'ushis' 'official']
Every magsource :  ['us' 'pt' 'nc' 'pr' 'ak' 'guc' 'hv' 'ci' 'nn' 'uu' 'gcmt' 'us_gcmt'
 'us_pgc' 'pgc' 'official' 'iscgem' 'se' 'nied' 'hrv' 'duputel' 'uw' 'par'
 '1023' '1009' '1000' 'brk' '1020' 'iscgemsup' 'isc' 'pas' 'pal' 'mos'
 'rot' 'mat' 'dor' 'hvo' 'upp' 'mmt' 'epb' 'vic' 'gr' 'hht' 'esm' 'wy'
 'woo' 'jon' 'cfr' 'sjg' 'abe' 'ntt' 'cdmg' 'nqt' 'dda' 'ell']

<ipython-input-14-a1cac74fff86>:89: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "Place"] = dataset["Place"].replace(self.place2index)
<ipython-input-14-a1cac74fff86>:90: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "MagType"] = dataset["MagType"].replace(self.magtype2index)
<ipython-input-14-a1cac74fff86>:91: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "net"] = dataset["net"].replace(self.net2index)
<ipython-input-14-a1cac74fff86>:92: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "Type"] = dataset["Type"].replace(self.type2index)
<ipython-input-14-a1cac74fff86>:93: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "status"] = dataset["status"].replace(self.status2index)
<ipython-input-14-a1cac74fff86>:94: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "locationSource"] = dataset["locationSource"].replace(self.locationsource2index)
<ipython-input-14-a1cac74fff86>:95: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "magSource"] = dataset["magSource"].replace(self.magsource2index)

# Convert the 'Time' column to datetime format
data['Time'] = pd.to_datetime(data['Time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

# Convert the 'Time' column to a string in the 'YYYYMMDD' format and then cast it to an integer type.
data['Time'] = data['Time'].dt.strftime('%Y%m%d').astype(int)

# Display the dataset after all changes
data.head()

data.dtypes

# Target Variable (Dependent Variable):
# Magnitude (Mag): The magnitude of the earthquake is a critical variable for earthquake prediction. This variable could serve as the target or dependent variable, especially for regression models.
# Alternatively, it could be categorized into bins (e.g., low, medium, high magnitude) if you are working with a classification model.
# Depth (Depth): The depth of the earthquake can also be an important factor in predicting the impact of an earthquake.
# We can choose one or both as target variables depending on the prediction model we're developing (regression or classification).

##### Feature Variables (Independent Variables):
# (Mag/Depth) if not the target variable
# Time
# Place
# Latitude
# Longitude

# data = data[['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag']]

# correlation matrix using seaborn heatmap
numeric_data = data.select_dtypes(include=['number'])
# we are computing the correlation matrix for numeric columns only
corr_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

##### Dimensionality Reduction

# Liste des colonnes que vous voulez conserver
cols_to_keep = ['Mag', 'Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'MagType']

# Garder uniquement les colonnes spécifiées
data = data[cols_to_keep]

# Identifier les colonnes à supprimer
cols_to_drop = [col for col in data.columns if col not in ['Mag', 'Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'MagType']]

# Supprimer les colonnes identifiées
data = data.drop(columns=cols_to_drop)

X = data.drop(columns=['Mag'])
y = data['Mag']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42) #we are splitting the data into train and test with 30% of test, as usual
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) #splitting the temp set to create validation (15%) and test (15%) datasets
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Training set: (26131, 6) (26131,)
Validation set: (5600, 6) (5600,)
Test set: (5600, 6) (5600,)

# Feature Scaling (Standardization)
# scaler = StandardScaler()

# Fit scaler on the training data and transform
# X_train_scaled = scaler.fit_transform(X_train)

# Use the same scaler to transform the test data
# X_test_scaled = scaler.transform(X_test)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['Mag'])  # Features
y = data['Mag']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model using RMSE and R^2 score
rmse_linear_reg = np.sqrt(mean_squared_error(y_test, y_pred))
r2_linear_reg = r2_score(y_test, y_pred)

# Print results
print(f"Root Mean Squared Error (RMSE): {rmse_linear_reg}")
print(f"R^2 Score: {r2_linear_reg}")

Root Mean Squared Error (RMSE): 0.44336309012136116
R^2 Score: 0.06837259025676035

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train model
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

accuracy_linear_reg = accuracy_score(y_test, y_pred_dt)
# Evaluate
print(f"Accuracy: {accuracy_linear_reg}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.6425
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      7104
           1       0.51      0.53      0.52      4096

    accuracy                           0.64     11200
   macro avg       0.62      0.62      0.62     11200
weighted avg       0.65      0.64      0.64     11200

from sklearn.linear_model import LogisticRegression

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the pipeline with Logistic Regression
pipeline_lr = Pipeline(steps=[
    ('model', LogisticRegression(random_state=42))
])

# Fit the pipeline on the training data
pipeline_lr.fit(X_train, y_train)

# Make predictions
y_pred_lr = pipeline_lr.predict(X_test)

acccuracy_logistic_reg = accuracy_score(y_test, y_pred_lr)

# Evaluate the model
print("Logistic Regression Results")
print(f"Accuracy: {acccuracy_logistic_reg}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

Logistic Regression Results
Accuracy: 0.6604464285714285
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.98      0.79      7104
           1       0.74      0.11      0.19      4096

    accuracy                           0.66     11200
   macro avg       0.70      0.54      0.49     11200
weighted avg       0.69      0.66      0.57     11200

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Train model
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Evaluate
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.7008035714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      7104
           1       0.62      0.48      0.54      4096

    accuracy                           0.70     11200
   macro avg       0.68      0.65      0.66     11200
weighted avg       0.69      0.70      0.69     11200

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train model
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Evaluate
print(f"Accuracy: {accuracy_dt}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.6425
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      7104
           1       0.51      0.53      0.52      4096

    accuracy                           0.64     11200
   macro avg       0.62      0.62      0.62     11200
weighted avg       0.65      0.64      0.64     11200

# cell that takes a long time to execute (~6mins)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
plot_tree(dt_model, feature_names=X_train.columns, filled=True, rounded=True)
plt.show()

from sklearn.neighbors import KNeighborsClassifier

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # standardize features
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # encode categorical variables
])

numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=5))
])

# Fit the pipeline on the training data
pipeline_knn.fit(X_train, y_train)

# Make predictions
y_pred_knn = pipeline_knn.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Evaluate the model
print("K-Nearest Neighbors Results")
print(f"Accuracy: {accuracy_knn}")
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))

K-Nearest Neighbors Results
Accuracy: 0.6742857142857143
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.80      0.76      7104
           1       0.57      0.45      0.50      4096

    accuracy                           0.67     11200
   macro avg       0.64      0.63      0.63     11200
weighted avg       0.66      0.67      0.66     11200

from sklearn.svm import SVC

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the pipeline with Support Vector Machine
pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVC(kernel='rbf', random_state=42))
])

# Fit the pipeline on the training data
pipeline_svm.fit(X_train, y_train)

# Make predictions
y_pred_svm = pipeline_svm.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Evaluate the model
print("SVM Classifier Results")
print(f"Accuracy: {accuracy_svm}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Classifier Results
Accuracy: 0.6978571428571428
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.92      0.80      7104
           1       0.70      0.30      0.42      4096

    accuracy                           0.70     11200
   macro avg       0.70      0.61      0.61     11200
weighted avg       0.70      0.70      0.66     11200

# cell that takes a lot of time to execute (~4 mins)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

param_grid = {
    'n_estimators': [100, 200],  # Number of trees
    'max_depth': [10, 20],       # Depth of each tree
    'min_samples_split': [2, 5], # Minimum samples required to split
    'min_samples_leaf': [1, 2],  # Minimum samples required in leaf nodes
    'bootstrap': [True]          # Whether to use bootstrap sampling
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Make predictions with the tuned model
y_pred_rf = best_rf.predict(X_test)

accuracy_best_rf = accuracy_score(y_test, y_pred_rf)

# Evaluate
print(f"Best Parameters: {best_params}")
print(f"Accuracy: {accuracy_best_rf}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.7175892857142857
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.93      0.81      7104
           1       0.74      0.35      0.48      4096

    accuracy                           0.72     11200
   macro avg       0.73      0.64      0.64     11200
weighted avg       0.72      0.72      0.69     11200

# Identifying Relevant Predictors
correlation_matrix = X.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

# Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X_train, y_train)
print(f"Selected Features: {X_train.columns[fit.support_]}")

Selected Features: Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth'], dtype='object')

# Feature Importance from Tree-Based Models
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
feature_importances = model.feature_importances_
sorted_idx = feature_importances.argsort()
plt.barh(X_train.columns[sorted_idx], feature_importances[sorted_idx])
plt.xlabel("Feature Importance")
plt.show()

# Creating New Features
# Interaction Features
X['Lat_long_interaction'] = X['Latitude'] * X['Longitude']

# Binning
X['Depth_binned'] = pd.cut(X['Depth'], bins=[0, 10, 50, 100, 500, 1000], labels=['Very Shallow', 'Shallow', 'Moderate', 'Deep', 'Very Deep'])

from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training set
X_res, y_res = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(y_res.value_counts())

Class distribution after SMOTE:
Mag
1    16384
0    16384
Name: count, dtype: int64

from imblearn.under_sampling import RandomUnderSampler

# Initialize RandomUnderSampler
undersampler = RandomUnderSampler(random_state=42)

# Apply undersampling to the training set
X_res, y_res = undersampler.fit_resample(X_train, y_train)

# Check the class distribution after undersampling
print("Class distribution after undersampling:")
print(y_res.value_counts())

Class distribution after undersampling:
Mag
0    9747
1    9747
Name: count, dtype: int64

from sklearn.ensemble import RandomForestClassifier

# Initialize RandomForestClassifier with class_weight='balanced'
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the model with the balanced class weights
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

accuracy_rf_balanced = accuracy_score(y_test, y_pred_rf)
# Evaluate the model
print(f"Accuracy: {accuracy_rf_balanced}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.7007142857142857
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      7104
           1       0.62      0.48      0.54      4096

    accuracy                           0.70     11200
   macro avg       0.68      0.65      0.66     11200
weighted avg       0.69      0.70      0.69     11200

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Identify column types
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())                  # Standardize features
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # Encode categorical variables
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the pipeline with XGBoost
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline on the training data
pipeline_xgb.fit(X_train, y_train)

# Make predictions
y_pred_xgb = pipeline_xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Evaluate the model
print("XGBoost Classifier Results")
print(f"Accuracy: {accuracy_xgb}")
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))

XGBoost Classifier Results
Accuracy: 0.7169642857142857
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.92      0.81      7104
           1       0.73      0.36      0.48      4096

    accuracy                           0.72     11200
   macro avg       0.72      0.64      0.64     11200
weighted avg       0.72      0.72      0.69     11200

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

X = data.drop(columns=['Mag'])
y = data['Mag']

# Convert categorical columns to numeric with LabelEncoder
categorical_cols = X.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define the model parameters
params = {
    'objective': 'regression',  # Regression
    'metric': 'rmse',           # Cost function: RMSE
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Trees
    'num_leaves': 31,           # Maximum number of leaves in each tree
    'learning_rate': 0.05,      # Learning rate
    'feature_fraction': 0.9     # Fraction of features used in each iteration
}

# Train the model manually, with early stopping check
num_round = 100
best_rmse = float('inf')
early_stopping_rounds = 10
best_iteration = 0
iterations_without_improvement = 0

for i in range(num_round):
    model = lgb.train(
        params,
        train_data,
        num_boost_round=i + 1,
        valid_sets=[train_data, test_data],
        valid_names=['train', 'test']
    )

    # Get the RMSE of the latest iteration on the test data
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    # Early stopping check
    if rmse < best_rmse:
        best_rmse = rmse
        best_iteration = i + 1
        iterations_without_improvement = 0
    else:
        iterations_without_improvement += 1

    if iterations_without_improvement >= early_stopping_rounds:
        print(f"Early stopping at iteration {i + 1} due to no improvement.")
        break

# Final model training
final_model = lgb.train(
    params,
    train_data,
    num_boost_round=best_iteration,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test']
)

# Make predictions
y_pred = final_model.predict(X_test, num_iteration=best_iteration)

# Calculate the RMSE
rmse_lightgbm = mean_squared_error(y_test, y_pred, squared=False)

# Calculate the R² score
r2_lightgbm = r2_score(y_test, y_pred)

# Display the results
print(f"RMSE: {rmse_lightgbm}")
print(f"R² Score: {r2_lightgbm}")

/usr/local/lib/python3.10/dist-packages/dask/dataframe/__init__.py:42: FutureWarning: 
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  warnings.warn(msg, FutureWarning)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001970 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 29864, number of used features: 6
[LightGBM] [Info] Start training from score 5.948877

!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Requirement already satisfied: graphviz in /usr/local/lib/python3.10/dist-packages (from catboost) (0.20.3)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from catboost) (3.8.0)
Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from catboost) (1.26.4)
Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from catboost) (2.2.2)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from catboost) (1.13.1)
Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from catboost) (5.24.1)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from catboost) (1.17.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.3.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (4.55.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (24.2)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (11.0.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (3.2.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->catboost) (9.0.0)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 MB 7.8 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.7

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize CatBoost Classifier
catboost_model = CatBoostClassifier(
    iterations=500,   # Number of boosting iterations
    learning_rate=0.1,  # Learning rate
    depth=6,  # Depth of the trees
    cat_features=[col for col in X.select_dtypes(include=['object']).columns],  # Categorical columns
    verbose=100  # Print progress every 100 iterations
)

# Fit the model
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred_catboost = catboost_model.predict(X_test)

accuracy_catboost = accuracy_score(y_test, y_pred_catboost)

# Evaluate performance
print(f"Accuracy: {accuracy_catboost}")
print("Classification Report:")
print(classification_report(y_test, y_pred_catboost))

0:	learn: 0.6766978	total: 247ms	remaining: 2m 3s
100:	learn: 0.5610198	total: 8.55s	remaining: 33.8s
200:	learn: 0.5402910	total: 18.8s	remaining: 28s
300:	learn: 0.5276883	total: 26.4s	remaining: 17.4s
400:	learn: 0.5173725	total: 36.7s	remaining: 9.05s
499:	learn: 0.5081135	total: 45.1s	remaining: 0us
Accuracy: 0.7208035714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.90      0.80      7104
           1       0.70      0.42      0.52      4096

    accuracy                           0.72     11200
   macro avg       0.71      0.66      0.66     11200
weighted avg       0.72      0.72      0.70     11200

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)

# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize DNN model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
y_pred_dnn = (model.predict(X_test) > 0.5).astype(int)

accuracy_dnn = accuracy_score(y_test, y_pred_dnn)

# Evaluate
print(f"Accuracy: {accuracy_dnn}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dnn))

/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

Epoch 1/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 14s 12ms/step - accuracy: 0.5330 - loss: 121441.3359 - val_accuracy: 0.6343 - val_loss: 37483.0234
Epoch 2/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 5s 6ms/step - accuracy: 0.5404 - loss: 20474.8438 - val_accuracy: 0.3657 - val_loss: 4154.3105
Epoch 3/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 7s 8ms/step - accuracy: 0.5322 - loss: 20850.3926 - val_accuracy: 0.3657 - val_loss: 3909.0786
Epoch 4/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 10s 8ms/step - accuracy: 0.5390 - loss: 14796.0938 - val_accuracy: 0.6343 - val_loss: 10607.5850
Epoch 5/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 11s 9ms/step - accuracy: 0.5395 - loss: 13382.0098 - val_accuracy: 0.3657 - val_loss: 20683.4863
Epoch 6/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.5371 - loss: 12430.1328 - val_accuracy: 0.6522 - val_loss: 2563.9446
Epoch 7/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 3s 3ms/step - accuracy: 0.5413 - loss: 14500.2275 - val_accuracy: 0.4873 - val_loss: 2359.9150
Epoch 8/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.5454 - loss: 12087.7842 - val_accuracy: 0.3657 - val_loss: 14181.4229
Epoch 9/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.5357 - loss: 9927.2852 - val_accuracy: 0.6343 - val_loss: 13280.9717
Epoch 10/10
817/817 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.5509 - loss: 9501.2607 - val_accuracy: 0.6629 - val_loss: 1854.6089
350/350 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step
Accuracy: 0.6628571428571428
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.91      0.77      7104
           1       0.60      0.23      0.33      4096

    accuracy                           0.66     11200
   macro avg       0.64      0.57      0.55     11200
weighted avg       0.65      0.66      0.61     11200

from sklearn.ensemble import GradientBoostingClassifier

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the pipeline with Gradient Boosting
pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(n_estimators=100, random_state=42))
])

# Fit the pipeline on the training data
pipeline_gb.fit(X_train, y_train)

# Make predictions
y_pred_gb = pipeline_gb.predict(X_test)

accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Evaluate the model
print("Gradient Boosting Classifier Results")
print(f"Accuracy: {accuracy_gb}")
print("Classification Report:")
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Classifier Results
Accuracy: 0.7126785714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.93      0.80      7104
           1       0.73      0.34      0.46      4096

    accuracy                           0.71     11200
   macro avg       0.72      0.63      0.63     11200
weighted avg       0.72      0.71      0.68     11200

# Adressing class imbalance using SMOTE
from sklearn.ensemble import GradientBoostingClassifier

X = data.drop(columns=['Mag'])
y = data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Class Imbalance Handling using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Define the pipeline with Gradient Boosting
pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(n_estimators=100, random_state=42))
])

# Fit the pipeline on the training data
pipeline_gb.fit(X_train_res, y_train_res)

# Make predictions
y_pred_gb = pipeline_gb.predict(X_test)

accuracy_gb_smote = accuracy_score(y_test, y_pred_gb)

# Evaluate the model
print("Gradient Boosting Classifier Results")
print(f"Accuracy: {accuracy_gb_smote}")
print("Classification Report:")
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Classifier Results
Accuracy: 0.6714285714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.71      0.73      7104
           1       0.55      0.61      0.58      4096

    accuracy                           0.67     11200
   macro avg       0.65      0.66      0.65     11200
weighted avg       0.68      0.67      0.67     11200

# importing necessary libraries
import numpy as np
import pandas as pd

from google.colab import files
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import os
from xgboost import XGBClassifier

# This loop walks through the directory "/kaggle/input" and prints the full path of each file found
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))  # Join and print the full file path

#uploaded = files.upload()

#data = pd.read_csv(next(iter(uploaded.keys())))
data = pd.read_csv("/content/sample_data/Significant Earthquake Dataset 1900-2023.csv", sep=',')
data.head()

import pandas as pd

numeric_data = data.select_dtypes(include=['float64', 'int64']) # select numerical columns

Q1 = numeric_data.quantile(0.25)  # 25th percentile
Q3 = numeric_data.quantile(0.75)  # 75th percentile
IQR = Q3 - Q1                     # interquartile range

# define outliers using IQR rule
outliers = (numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR)) # number of outliers per features
outlier_counts = outliers.sum()

outliers_percentage = (outlier_counts / len(data)) * 100 # % of outliers per features

print("Number of outliers per column:\n")
print(outlier_counts, "\n")
print("Percentage of outliers per column:\n")
print(outliers_percentage)

Number of outliers per column:

Latitude              0
Longitude             0
Depth              5168
Mag                1580
nst                  41
gap                 520
dmin                340
rms                 645
Unnamed: 14           0
horizontalError     164
depthError          339
magError           1296
magNst              390
dtype: int64 

Percentage of outliers per column:

Latitude            0.000000
Longitude           0.000000
Depth              13.843722
Mag                 4.232407
nst                 0.109828
gap                 1.392944
dmin                0.910771
rms                 1.727787
Unnamed: 14         0.000000
horizontalError     0.439313
depthError          0.908092
magError            3.471646
magNst              1.044708
dtype: float64

# we used boxplots to vizualize our outliers for Depth, Mag, and magError

plt.figure(figsize=(10, 8))

plt.subplot(3, 1, 1)
sns.boxplot(x=data['Depth'], orient='h', color='#87CEEB')
plt.title('Boxplot for Depth')
plt.xlabel('Depth (km)')

plt.subplot(3, 1, 2)
sns.boxplot(x=data['Mag'], orient='h', color='#DC143C')
plt.title('Boxplot for Magnitude')
plt.xlabel('Magnitude')

plt.subplot(3, 1, 3)
sns.boxplot(x=data['magError'], orient='h', color='#228B22')
plt.title('Boxplot for Magnitude Error')
plt.xlabel('Magnitude Error')

plt.tight_layout()

plt.show()

data[['Depth', 'Mag', 'magError']].describe()

# drop rows with negative depth
def handle_outliers(df):
    data_negative_depths = df[df['Depth'] < 0]
    # print("Rows with negative depths:")
    # data_negative_depths
    df = df[df['Depth'] >= 0]
    return df

print("Data after removing rows with negative depths:")
data_outliers = handle_outliers(data.copy())
data_outliers[['Depth', 'Mag', 'magError']].describe()

Data after removing rows with negative depths:

# let's check which columns have missing values
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100
print(missing_values)
print(missing_percentage)

Time                   0
Place                284
Latitude               0
Longitude              0
Depth                134
Mag                    0
MagType                0
nst                29858
gap                27244
dmin               32936
rms                17113
net                    0
ID                     0
Updated                0
Unnamed: 14        37331
Type                   0
horizontalError    33361
depthError         16504
magError           20780
magNst             31959
status                 0
locationSource         0
magSource              0
dtype: int64
Time                 0.000000
Place                0.760762
Latitude             0.000000
Longitude            0.000000
Depth                0.358951
Mag                  0.000000
MagType              0.000000
nst                 79.981785
gap                 72.979561
dmin                88.226943
rms                 45.841258
net                  0.000000
ID                   0.000000
Updated              0.000000
Unnamed: 14        100.000000
Type                 0.000000
horizontalError     89.365407
depthError          44.209906
magError            55.664193
magNst              85.609815
status               0.000000
locationSource       0.000000
magSource            0.000000
dtype: float64

def handle_missing_values(df):
    # we consider that the data is not important if the place is unknown
    df = df.dropna(subset=['Place'])

    # we will drop Unnamed:14 column because it is 100% empty
    df = df.drop(columns=['Unnamed: 14'])

    # handle missing values in numeric columns
    for col in df.select_dtypes(include=['number']).columns:
        # for numeric columns, replace NaNs with the median
        median_value = df[col].median()
        df[col].fillna(median_value, inplace=True)

    # handle missing values in non-numeric columns
    for col in df.select_dtypes(exclude=['number']).columns:
        # For non-numeric columns, fill NaNs with a placeholder
        if df[col].dtype == 'O':
            mode_value = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
            df[col].fillna(mode_value, inplace=True)
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col].fillna('MissingDate', inplace=True)

    return df

# apply the function to handle missing values
data_outliers = handle_outliers(data.copy())
data_missing_values = handle_missing_values(data_outliers.copy())

<ipython-input-201-f24cd1833a9b>:12: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_value, inplace=True)
<ipython-input-201-f24cd1833a9b>:19: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)

# we make sure that there is no missing values left
missing_values = data_missing_values.isnull().sum()
print(missing_values)

Time               0
Place              0
Latitude           0
Longitude          0
Depth              0
Mag                0
MagType            0
nst                0
gap                0
dmin               0
rms                0
net                0
ID                 0
Updated            0
Type               0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
dtype: int64

# new features from Time column

def new_time_features(df):
    # extract hour, day, month, and year
    df['Time'] = pd.to_datetime(df['Time'])
    df['Hour'] = df['Time'].dt.hour
    df['Day'] = df['Time'].dt.day
    df['Month'] = df['Time'].dt.month

    # extract season
    df['Season'] = df['Month'].map({
        12: '0', 1: '0', 2: '0',  # 0 for winter
        3: '1', 4: '1', 5: '1',   # 1 for spring
        6: '2', 7: '2', 8: '2',   # 2 for summer
        9: '3', 10: '3', 11: '3'  # 3 for winter
    })

    # elapsed time since last earthquake
    df = df.sort_values(by=['Place', 'Time'])
    df['ElapsedTime'] = df.groupby('Place')['Time'].diff().dt.total_seconds() / 3600
    # handle missing values (for example, first earthquake in the region)
    df['ElapsedTime'] = df['ElapsedTime'].fillna(0)

    return df

data_time = new_time_features(data.copy())
data_time.tail()

# new geological features
from sklearn.cluster import KMeans

def new_geo_features(df):
    # regional clustering using k-means model
    coords = df[['Latitude', 'Longitude']]
    kmeans = KMeans(n_clusters=5, random_state=0)
    df['RegionCluster'] = kmeans.fit_predict(coords)

    return df

data_geo = new_geo_features(data.copy())
data_geo.head()

# seismicity (nst, gap, net, dmin, rms)
# interaction between variables
def new_seismicity_features(df):
    df['nstxgap'] = df['nst'] * df['gap']
    df['nstxdmin'] = df['nst'] * df['dmin']
    return df

data_seismicity = new_seismicity_features(data.copy())
data_seismicity.head()

# errors (horizontalError, depthError, magError)
def new_error_features(df):
    df['totalError'] = df['horizontalError'] + df['depthError'] + df['magError']
    return df

data_error = new_error_features(data.copy())
data_error.head()

class CustomTransformation():

    def __init__(self):
        """
        Simple class to convert categorical variables into numeric indices,
        usable for a machine learning algorithm
        """
        self.placefitted = False
        self.place2index = dict()
        self.placemax_index = 0

        self.magtypefitted = False
        self.magtype2index = dict()
        self.magtypemax_index = 0

        self.netfitted = False
        self.net2index = dict()
        self.netmax_index = 0

        self.typefitted = False
        self.type2index = dict()
        self.typemax_index = 0

        self.statusfitted = False
        self.status2index = dict()
        self.statusmax_index = 0

        self.locationsourcefitted = False
        self.locationsource2index = dict()
        self.locationsourcemax_index = 0

        self.magsourcefitted = False
        self.magsource2index = dict()
        self.magsourcemax_index = 0

    def fit_transform(self, dataset):

        self.placefitted = True
        self.magtypefitted = True
        self.netfitted = True
        self.typefitted = True
        self.statusfitted = True
        self.locationsourcefitted = True
        self.magsourcefitted = True

        places = dataset["Place"].unique()
        #print("Every place : ", places)
        self.place2index = {prop:i for (i, prop) in enumerate(places)}
        self.placemax_index = max(list(self.place2index.values()))

        magtypes = dataset["MagType"].unique()
        #print("Every magtype : ", magtypes)
        self.magtype2index = {prop:i for (i, prop) in enumerate(magtypes)}
        self.magtypemax_index = max(list(self.magtype2index.values()))

        nets = dataset["net"].unique()
        #print("Every net : ", nets)
        self.net2index = {prop:i for (i, prop) in enumerate(nets)}
        self.netmax_index = max(list(self.net2index.values()))

        types = dataset["Type"].unique()
        #print("Every type of earthquakes : ", types)
        self.type2index = {prop:i for (i, prop) in enumerate(types)}
        self.typemax_index = max(list(self.type2index.values()))

        status = dataset["status"].unique()
        #print("Every status : ", status)
        self.status2index = {prop:i for (i, prop) in enumerate(status)}
        self.statusmax_index = max(list(self.status2index.values()))

        locationsources = dataset["locationSource"].unique()
        #print("Every location source : ", locationsources)
        self.locationsource2index = {prop:i for (i, prop) in enumerate(locationsources)}
        self.locationsourcemax_index = max(list(self.locationsource2index.values()))

        magsources = dataset["magSource"].unique()
        #print("Every magsource : ", magsources)
        self.magsource2index = {prop:i for (i, prop) in enumerate(magsources)}
        self.magsourcemax_index = max(list(self.magsource2index.values()))

        return self.transform(dataset)

    def transform(self, dataset):

        dataset.loc[:, "Place"] = dataset["Place"].replace(self.place2index)
        dataset.loc[:, "MagType"] = dataset["MagType"].replace(self.magtype2index)
        dataset.loc[:, "net"] = dataset["net"].replace(self.net2index)
        dataset.loc[:, "Type"] = dataset["Type"].replace(self.type2index)
        dataset.loc[:, "status"] = dataset["status"].replace(self.status2index)
        dataset.loc[:, "locationSource"] = dataset["locationSource"].replace(self.locationsource2index)
        dataset.loc[:, "magSource"] = dataset["magSource"].replace(self.magsource2index)

        dataset[dataset.Place.isna()] = self.placemax_index + 1
        dataset[dataset.MagType.isna()] = self.magtypemax_index + 1
        dataset[dataset.net.isna()] = self.netmax_index + 1
        dataset[dataset.Type.isna()] = self.typemax_index + 1
        dataset[dataset.status.isna()] = self.statusmax_index + 1
        dataset[dataset.locationSource.isna()] = self.locationsourcemax_index + 1
        dataset[dataset.magSource.isna()] = self.magsourcemax_index + 1

        # convert the Time column into numeric data
        dataset['Time'] = pd.to_datetime(dataset['Time'], format='%Y-%m-%dT%H:%M:%S.%fZ')
        dataset['Time'] = dataset['Time'].dt.strftime('%Y%m%d').astype(int)

        # ensure the column is numeric
        dataset["Place"] = pd.to_numeric(dataset["Place"], errors="coerce")
        dataset["MagType"] = pd.to_numeric(dataset["MagType"], errors="coerce")
        dataset["net"] = pd.to_numeric(dataset["net"], errors="coerce")
        dataset["Type"] = pd.to_numeric(dataset["Type"], errors="coerce")
        dataset["status"] = pd.to_numeric(dataset["status"], errors="coerce")
        dataset["locationSource"] = pd.to_numeric(dataset["locationSource"], errors="coerce")
        dataset["magSource"] = pd.to_numeric(dataset["magSource"], errors="coerce")

        return dataset

# cell that regroups all the modification we've done on the dataset for stage 3 (~3 mins)
data = pd.read_csv("/content/sample_data/Significant Earthquake Dataset 1900-2023.csv", sep=',')

data_without_missing_values = handle_missing_values(data.copy()) # handling missing values

data_without_outliers = handle_outliers(data_without_missing_values.copy()) # handling outliers

data_time_features = new_time_features(data_without_outliers.copy()) # new time features

data_geo_features = new_geo_features(data_time_features.copy()) # new geological features

data_seism_features = new_seismicity_features(data_geo_features.copy()) # new seismicity features

data_error_features = new_error_features(data_seism_features.copy()) # new total error feature

features_transformer = CustomTransformation() # converting categorical data into numeric data
new_data = features_transformer.fit_transform(data_error_features.copy())

<ipython-input-201-f24cd1833a9b>:12: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_value, inplace=True)
<ipython-input-201-f24cd1833a9b>:19: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)
<ipython-input-207-9204deb59d90>:85: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "Place"] = dataset["Place"].replace(self.place2index)
<ipython-input-207-9204deb59d90>:86: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "MagType"] = dataset["MagType"].replace(self.magtype2index)
<ipython-input-207-9204deb59d90>:87: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "net"] = dataset["net"].replace(self.net2index)
<ipython-input-207-9204deb59d90>:88: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "Type"] = dataset["Type"].replace(self.type2index)
<ipython-input-207-9204deb59d90>:89: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "status"] = dataset["status"].replace(self.status2index)
<ipython-input-207-9204deb59d90>:90: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "locationSource"] = dataset["locationSource"].replace(self.locationsource2index)
<ipython-input-207-9204deb59d90>:91: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, "magSource"] = dataset["magSource"].replace(self.magsource2index)

# display dataset after all the modifications
new_data.head()

# correlation matrix using seaborn heatmap
numeric_data = new_data.select_dtypes(include=['number'])

# we are computing the correlation matrix for numeric columns only
corr_matrix = numeric_data.corr()
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# define the columns we want to keep
filtered_data = new_data[['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'MagType', 'nst', 'gap', 'net', 'depthError', 'magError', 'status', 'locationSource', 'magSource', 'totalError', 'Mag']]

filtered_data.head()

# summary of all the accuracies calculated in stage 1 and 2 for each machine learning algorithm
models = ['Random Forest Classifier', 'Decision Tree CLassifier', 'KNN Classifier', 'SVM Classifier', 'XGB Classifier', 'CastBoost Classifier', 'DNN Classifier', 'Gradient Boosting Classifier']
scores = [accuracy_best_rf, accuracy_dt, accuracy_knn, accuracy_svm, accuracy_xgb, accuracy_catboost, accuracy_dnn, accuracy_gb]

plt.figure(figsize=(18, 6))
bars = plt.bar(models, scores, color=['#87CEEB', '#DC143C', '#228B22', '#DAA520', '#9932CC', '#FF7F50', '#4682B4'])

for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{score:.2f}',
             ha='center', va='bottom', fontsize=12)

plt.xlabel('Models')
plt.ylabel('Performance (Accuracy)')
plt.title('Comparaison of machine learning model performance')
plt.ylim(0, 1)

plt.show()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# define feature set (x) and target variable (y)
x = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)

# split data into training and testing sets (70% training, 30% testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# initialize random forest classifier with the best parameters we found in stage 2
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

# train the random forest model on the training data
rf.fit(x_train, y_train)

# make predictions on the test data
y_pred_rf = rf.predict(x_test)

# calculate accuracy of the model
new_accuracy_rf = accuracy_score(y_test, y_pred_rf)

# print results for the random forest classifier
print("Random forest classifier results")
print(f"Accuracy: {new_accuracy_rf:.2f}")
print("Classification report:")
print(classification_report(y_test, y_pred_rf))

Random forest classifier results
Accuracy: 0.74
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.91      0.81      6981
           1       0.75      0.45      0.56      4131

    accuracy                           0.74     11112
   macro avg       0.74      0.68      0.69     11112
weighted avg       0.74      0.74      0.72     11112

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # binary classification target: 1 for Mag >= 6.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize CatBoost (with the best parameters)
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_prob = rf.predict_proba(X_test)[:, 1]

# apply threshold moving
threshold = 0.4
y_pred_threshold = (y_pred_prob >= threshold).astype(int)

accuracy_rf_threshold = accuracy_score(y_test, y_pred_threshold)

print("RandomForest with threshold moving results")
print(f"Accuracy: {accuracy_rf_threshold:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_threshold))

RandomForest with threshold moving results
Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      6981
           1       0.66      0.61      0.63      4131

    accuracy                           0.74     11112
   macro avg       0.72      0.71      0.71     11112
weighted avg       0.73      0.74      0.73     11112

# cell that may take some time to execute (~2mins)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = filtered_data.drop(columns=['Mag'])  # Drop the 'Mag' column to use other features
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Target: 1 for Mag >= 6.0, else 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize the base models
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

stacked_model = StackingClassifier(
    estimators=[('rf', rf), ('gb', gb)],
    final_estimator=RandomForestClassifier(random_state=42)
)

stacked_model.fit(X_train, y_train)

y_pred_stacked = stacked_model.predict(X_test)

accuracy_rf_gb_stacked = accuracy_score(y_test, y_pred_stacked)

print("Stacked model results")
print(f"Accuracy: {accuracy_rf_gb_stacked:.2f}")
print("Classification report:")
print(classification_report(y_test, y_pred_stacked))

Stacked model results
Accuracy: 0.70
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.81      0.77      6981
           1       0.62      0.51      0.56      4131

    accuracy                           0.70     11112
   macro avg       0.68      0.66      0.67     11112
weighted avg       0.69      0.70      0.70     11112

# may take some time to execute (~4mins)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

x = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)

# calculate the scale_pos_weight based on class imbalance
scale_pos_weight = len(y) / (2 * sum(y))

# define the parameter grid for Grid Search
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.1, 0.2],
    'model__subsample': [0.8, 1.0],
}

# define the pipeline with the xgboost classifier
pipeline_xgb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight))  # xgboost model
])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=pipeline_xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# fit Grid Search to the training data
grid_search.fit(x_train, y_train)

print("Best Parameters found: ", grid_search.best_params_)

# use the best parameters to train the final model
best_pipeline_xgb = grid_search.best_estimator_
best_pipeline_xgb.fit(x_train, y_train)

y_pred_best_xgb = best_pipeline_xgb.predict(x_test)

new_accuracy_best_xgb = accuracy_score(y_test, y_pred_best_xgb)

print("XGBoost classifier results with best parameters")
print(f"Accuracy: {new_accuracy_best_xgb:.2f}")
print("Classification report:")
print(classification_report(y_test, y_pred_best_xgb))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters found:  {'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__subsample': 1.0}
XGBoost classifier results with best parameters
Accuracy: 0.75
Classification report:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81      6981
           1       0.69      0.61      0.64      4131

    accuracy                           0.75     11112
   macro avg       0.73      0.72      0.73     11112
weighted avg       0.75      0.75      0.75     11112

# cell that may take some time to execute (~3mins)

from sklearn.metrics import make_scorer, f1_score, recall_score, classification_report

# define a custom scorer for recall and f1-score (focus on class 1)
recall_scorer = make_scorer(recall_score, average=None, labels=[1])
f1_scorer = make_scorer(f1_score, average=None, labels=[1])

# set up a pipeline with standardization
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBClassifier())
])

# parameter grid for optimization
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [4, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__subsample': [0.8, 1.0]
}

grid_search_recall = GridSearchCV(
    pipeline, param_grid, scoring=recall_scorer, cv=5, verbose=2, n_jobs=-1
)
grid_search_f1 = GridSearchCV(
    pipeline, param_grid, scoring=f1_scorer, cv=5, verbose=2, n_jobs=-1
)

print("Optimization for recall (class 1):")
grid_search_recall.fit(X_train, y_train)

print("\nOptimization for f1-score (class 1):")
grid_search_f1.fit(X_train, y_train)

best_model_recall = grid_search_recall.best_estimator_
y_pred_recall = best_model_recall.predict(X_test)
accuracy_recall_xgb = accuracy_score(y_test, y_pred_recall)
print("\nBest model optimized for recall (class 1):")
print(f"Accuracy: {accuracy_recall_xgb:.2f}")
print(classification_report(y_test, y_pred_recall))

best_model_f1 = grid_search_f1.best_estimator_
y_pred_f1 = best_model_f1.predict(X_test)
accuracy_f1_xgb = accuracy_score(y_test, y_pred_f1)
print("\nBest model optimized for f1-score (class 1):")
print(f"Accuracy: {accuracy_f1_xgb:.2f}")
print(classification_report(y_test, y_pred_f1))

Optimization for recall (class 1):
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Optimization for f1-score (class 1):
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best model optimized for recall (class 1):
Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.76      0.89      0.82      6981
           1       0.74      0.53      0.62      4131

    accuracy                           0.76     11112
   macro avg       0.75      0.71      0.72     11112
weighted avg       0.75      0.76      0.75     11112


Best model optimized for f1-score (class 1):
Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.76      0.89      0.82      6981
           1       0.74      0.53      0.62      4131

    accuracy                           0.76     11112
   macro avg       0.75      0.71      0.72     11112
weighted avg       0.75      0.76      0.75     11112

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

pipeline_xgb = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Scaling the features
    ('model', XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.2, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline_xgb.fit(X_train, y_train)

y_pred_prob_xgb = pipeline_xgb.predict_proba(X_test)[:, 1]  # probabilities for class 1

threshold = 0.4 # set the threshold to 0.4
y_pred_threshold_xgb = (y_pred_prob_xgb >= threshold).astype(int)  # apply threshold

accuracy_xgb_threshold = accuracy_score(y_test, y_pred_threshold_xgb)

print("XGBoost Classifier Results with Threshold Moving")
print(f"Accuracy: {accuracy_xgb_threshold}")
print("Classification Report:")
print(classification_report(y_test, y_pred_threshold_xgb))

XGBoost Classifier Results with Threshold Moving
Accuracy: 0.7458603311735061
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      6981
           1       0.66      0.65      0.66      4131

    accuracy                           0.75     11112
   macro avg       0.73      0.73      0.73     11112
weighted avg       0.75      0.75      0.75     11112

# cell that may take some time to execute (~5mins)
from sklearn.model_selection import GridSearchCV

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target: 1 for Mag >= 6.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

param_grid = {
    'learning_rate': [0.1, 0.2],
    'depth': [4, 6],
    'iterations': [100, 200],
    'l2_leaf_reg': [1, 3],
}

catboost = CatBoostClassifier(cat_features=[col for col in X.select_dtypes(include=['object']).columns], verbose=0)

# initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# train Grid Search on the training data
grid_search.fit(X_train, y_train)

print("Best Parameters found: ", grid_search.best_params_)

# use the best parameters to train the final model
best_catboost = grid_search.best_estimator_
best_catboost.fit(X_train, y_train)

# make predictions and evaluate the model
y_pred_best_catboost = best_catboost.predict(X_test)
accuracy_best_cb = accuracy_score(y_test, y_pred_best_catboost)

print(f"Accuracy with Best Parameters: {accuracy_best_cb}")
print("Classification Report:")
print(classification_report(y_test, y_pred_best_catboost))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters found:  {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.2}
Accuracy with Best Parameters: 0.7523398128149748
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.88      0.82      6981
           1       0.73      0.53      0.61      4131

    accuracy                           0.75     11112
   macro avg       0.75      0.71      0.72     11112
weighted avg       0.75      0.75      0.74     11112

# may take some time to execute (~4mins)

from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score, recall_score, classification_report

# define a custom scorer for recall and f1-score (focus on class 1)
recall_scorer = make_scorer(recall_score, average=None, labels=[1])
f1_scorer = make_scorer(f1_score, average=None, labels=[1])

# define the CatBoost model
catboost_model = CatBoostClassifier(depth=4, iterations=200, l2_leaf_reg=3, learning_rate=0.2, verbose=100)

# set up a pipeline with standardization
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', catboost_model)
])

# parameter grid for optimization
param_grid = {
    'model__iterations': [100, 200],
    'model__depth': [4, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__l2_leaf_reg': [3, 5],
}

# grid search to optimize for recall
grid_search_recall = GridSearchCV(
    pipeline, param_grid, scoring=recall_scorer, cv=5, verbose=2, n_jobs=-1
)

# grid search to optimize for f1-score
grid_search_f1 = GridSearchCV(
    pipeline, param_grid, scoring=f1_scorer, cv=5, verbose=2, n_jobs=-1
)

# optimization for recall
print("Optimization for recall (class 1):")
grid_search_recall.fit(X_train, y_train)

# optimization for f1-score
print("\nOptimization for f1-score (class 1):")
grid_search_f1.fit(X_train, y_train)

# evaluate the best model optimized for recall
best_model_recall = grid_search_recall.best_estimator_
y_pred_recall = best_model_recall.predict(X_test)
print("\nBest model optimized for recall (class 1):")
print(classification_report(y_test, y_pred_recall))

# evaluate the best model optimized for f1-score
best_model_f1 = grid_search_f1.best_estimator_
y_pred_f1 = best_model_f1.predict(X_test)
print("\nBest model optimized for f1-score (class 1):")
print(classification_report(y_test, y_pred_f1))

Optimization for recall (class 1):
Fitting 5 folds for each of 16 candidates, totalling 80 fits
0:	learn: 0.6761000	total: 10.3ms	remaining: 2.05s
100:	learn: 0.5192778	total: 2.07s	remaining: 2.03s
199:	learn: 0.4909503	total: 4.51s	remaining: 0us

Optimization for f1-score (class 1):
Fitting 5 folds for each of 16 candidates, totalling 80 fits
0:	learn: 0.6761000	total: 12.6ms	remaining: 2.5s
100:	learn: 0.5192778	total: 1.85s	remaining: 1.81s
199:	learn: 0.4909503	total: 2.85s	remaining: 0us

Best model optimized for recall (class 1):
              precision    recall  f1-score   support

           0       0.76      0.90      0.82      6981
           1       0.75      0.51      0.60      4131

    accuracy                           0.75     11112
   macro avg       0.75      0.70      0.71     11112
weighted avg       0.75      0.75      0.74     11112


Best model optimized for f1-score (class 1):
              precision    recall  f1-score   support

           0       0.76      0.90      0.82      6981
           1       0.75      0.51      0.60      4131

    accuracy                           0.75     11112
   macro avg       0.75      0.70      0.71     11112
weighted avg       0.75      0.75      0.74     11112

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # binary classification target: 1 for Mag >= 6.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize CatBoost (with the best parameters)
catboost_model = CatBoostClassifier(
    depth=6,
    iterations=200,
    learning_rate=0.2,
    l2_leaf_reg=3,
    verbose=100
)

catboost_model.fit(X_train, y_train)

y_pred_prob = catboost_model.predict_proba(X_test)[:, 1]

# apply threshold moving
threshold = 0.4
y_pred_threshold = (y_pred_prob >= threshold).astype(int)

accuracy_cb_threshold = accuracy_score(y_test, y_pred_threshold)

print("CatBoost with threshold moving results")
print(f"Accuracy: {accuracy_cb_threshold}")
print("Classification Report:")
print(classification_report(y_test, y_pred_threshold))

0:	learn: 0.6597662	total: 11.3ms	remaining: 2.25s
100:	learn: 0.4773821	total: 1.18s	remaining: 1.15s
199:	learn: 0.4388250	total: 2.26s	remaining: 0us
CatBoost with threshold moving results
Accuracy: 0.7481101511879049
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      6981
           1       0.66      0.66      0.66      4131

    accuracy                           0.75     11112
   macro avg       0.73      0.73      0.73     11112
weighted avg       0.75      0.75      0.75     11112

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # Binary classification target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize CatBoost and RandomForest
catboost_model = CatBoostClassifier(
    depth=6,
    iterations=200,
    learning_rate=0.2,
    l2_leaf_reg=3,
    verbose=100
)

random_forest_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# initialize the voting classifier with soft voting (probabilities are averaged)
voting_clf = VotingClassifier(
    estimators=[('catboost', catboost_model), ('random_forest', random_forest_model)],
    voting='soft'
)

# fit the voting classifier
voting_clf.fit(X_train, y_train)

y_pred_prob = voting_clf.predict_proba(X_test)[:, 1]

# move the decision threshold to 0.4 for Class 1
threshold = 0.4
y_pred_threshold = (y_pred_prob >= threshold).astype(int)

accuracy_cb_rf_threshold = accuracy_score(y_test, y_pred_threshold)

print("CatBoost + RandomForest + Threshold results")
print(f"Accuracy: {accuracy_cb_rf_threshold}")
print("Classification Report:")
print(classification_report(y_test, y_pred_threshold))

0:	learn: 0.6597662	total: 12.6ms	remaining: 2.51s
100:	learn: 0.4773821	total: 1.2s	remaining: 1.18s
199:	learn: 0.4388250	total: 2.37s	remaining: 0us
CatBoost + RandomForest + Threshold results
Accuracy: 0.7477501799856011
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      6981
           1       0.66      0.67      0.66      4131

    accuracy                           0.75     11112
   macro avg       0.73      0.73      0.73     11112
weighted avg       0.75      0.75      0.75     11112

# let's try stacking the four methods together
# cell that may take some time to execute (~2mins)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# split the data into features (X) and target (y)
X = filtered_data.drop(columns=['Mag'])  # drop the 'Mag' column to use other features
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # binary classification: 1 for Mag >= 6.0, else 0

# separate numerical and categorical features
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns  # select numeric columns
categorical_features = X.select_dtypes(include=['object']).columns  # select categorical columns

# calculate the scale_pos_weight based on class imbalance
scale_pos_weight = len(y) / (2 * sum(y))

# define a pipeline for transforming numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # standardize numerical features
])

# define a pipeline for transforming categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # one-hot encode categorical variables, ignore unknowns
])

# combine the numerical and categorical transformations into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),  # apply numerical transformer
        ('cat', categorical_transformer, categorical_features)  # apply categorical transformer
    ]
)

# define the base models
rf_model = RandomForestClassifier(bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=42)
xgb_model = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8)
catboost_model = CatBoostClassifier(depth=4, iterations=200, l2_leaf_reg=3, learning_rate=0.2, verbose=100)
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# define the stacking classifier
stacked_model = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('gb', gb_model)
    ],
    final_estimator=CatBoostClassifier(iterations=200, learning_rate=0.1, depth=6, verbose=0)
)

# define the pipeline with the stacking classifier
pipeline_stacked = Pipeline(steps=[
    ('preprocessor', preprocessor),  # apply the preprocessor to X
    ('model', stacked_model)  # stacking model
])

# split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# fit the pipeline to the training data
pipeline_stacked.fit(X_train, y_train)

# make predictions on the test data
y_pred_stacked = pipeline_stacked.predict(X_test)

# calculate accuracy of the stacked model
accuracy_stacked = accuracy_score(y_test, y_pred_stacked)

# print results for the stacked model
print("Stacked Model Results")
print(f"Accuracy: {accuracy_stacked}")
print("Classification Report:")
print(classification_report(y_test, y_pred_stacked))

0:	learn: 0.6625878	total: 7.08ms	remaining: 1.41s
100:	learn: 0.5046214	total: 806ms	remaining: 790ms
199:	learn: 0.4799704	total: 1.59s	remaining: 0us
0:	learn: 0.6617258	total: 6.85ms	remaining: 1.36s
100:	learn: 0.5041383	total: 825ms	remaining: 808ms
199:	learn: 0.4768974	total: 1.54s	remaining: 0us
0:	learn: 0.6613167	total: 6.95ms	remaining: 1.38s
100:	learn: 0.5019893	total: 727ms	remaining: 712ms
199:	learn: 0.4739053	total: 1.41s	remaining: 0us
0:	learn: 0.6636367	total: 6.43ms	remaining: 1.28s
100:	learn: 0.4993857	total: 909ms	remaining: 891ms
199:	learn: 0.4716282	total: 2.72s	remaining: 0us
0:	learn: 0.6638223	total: 18.3ms	remaining: 3.64s
100:	learn: 0.5040038	total: 1.83s	remaining: 1.79s
199:	learn: 0.4770305	total: 2.54s	remaining: 0us
0:	learn: 0.6627518	total: 6.18ms	remaining: 1.23s
100:	learn: 0.5027857	total: 731ms	remaining: 716ms
199:	learn: 0.4741388	total: 1.39s	remaining: 0us
Stacked Model Results
Accuracy: 0.7568394528437725
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      6981
           1       0.73      0.55      0.63      4131

    accuracy                           0.76     11112
   macro avg       0.75      0.71      0.72     11112
weighted avg       0.75      0.76      0.75     11112

# overview of the kaggle notebook with the correct fit and predict method

# import necessary libraries for regression and evaluation
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# select features (x) and target variable (y)
x = data[["Latitude", "Longitude", "Depth", "nst", "gap", "rms", "magError", "horizontalError"]]  # features
y = data["Mag"]  # target: earthquake magnitude

# split the data into training and testing sets (80% training, 20% testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# standardize the features using standard scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)  # fit the scaler to training data and transform
x_test = scaler.transform(x_test)  # transform the testing data

# initialize decision tree regressor
dt_regressor = DecisionTreeRegressor(random_state=42)

# train the decision tree regressor on the training data
dt_regressor.fit(x_train, y_train)

# predict target values for the testing set
y_pred = dt_regressor.predict(x_test)

# evaluate the model using r2 score, root mean squared error (rmse), and mean absolute error (mae)
r2 = r2_score(y_test, y_pred)  # r2 score measures the goodness of fit
rmse = mean_squared_error(y_test, y_pred, squared=False)  # calculate root mean squared error
mae = mean_absolute_error(y_test, y_pred)  # calculate mean absolute error

# print the evaluation metrics
print(f"r2 score : {r2}")
print(f"root mean squared error (rmse): {rmse}")
print(f"mean absolute error (mae): {mae}")

r2 score : -0.5279563571006627
root mean squared error (rmse): 0.5677980300419442
mean absolute error (mae): 0.39543993571715547

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

X = filtered_data.drop(columns=['Mag'])
y = filtered_data['Mag'].apply(lambda x: 1 if x >= 6.0 else 0)  # binary classification target: 1 for Mag >= 6.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = models.Sequential([
    layers.InputLayer(shape=(X_train.shape[1],)),  # input layer
    layers.Dense(128, activation='relu'),               # hidden layer 1
    layers.BatchNormalization(),                        # batch normalization
    layers.Dropout(0.2),                                # dropout to prevent overfitting
    layers.Dense(64, activation='relu'),                # hidden layer 2
    layers.BatchNormalization(),                        # batch normalization
    layers.Dropout(0.2),                                # dropout
    layers.Dense(32, activation='relu'),                # hidden layer 3
    layers.BatchNormalization(),                        # batch normalization
    layers.Dropout(0.2),                                # dropout
    layers.Dense(1, activation='sigmoid')               # output layer (sigmoid for binary classification)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 8s 5ms/step - accuracy: 0.6292 - loss: 0.6691 - val_accuracy: 0.6915 - val_loss: 0.5898
Epoch 2/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.6839 - loss: 0.5985 - val_accuracy: 0.6969 - val_loss: 0.5804
Epoch 3/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 6s 5ms/step - accuracy: 0.6932 - loss: 0.5858 - val_accuracy: 0.7063 - val_loss: 0.5745
Epoch 4/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.6977 - loss: 0.5815 - val_accuracy: 0.7133 - val_loss: 0.5651
Epoch 5/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.7066 - loss: 0.5723 - val_accuracy: 0.7156 - val_loss: 0.5624
Epoch 6/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 6s 5ms/step - accuracy: 0.7101 - loss: 0.5659 - val_accuracy: 0.7154 - val_loss: 0.5619
Epoch 7/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.7101 - loss: 0.5681 - val_accuracy: 0.7219 - val_loss: 0.5573
Epoch 8/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.7139 - loss: 0.5629 - val_accuracy: 0.7245 - val_loss: 0.5535
Epoch 9/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 6s 5ms/step - accuracy: 0.7200 - loss: 0.5585 - val_accuracy: 0.7173 - val_loss: 0.5607
Epoch 10/10
926/926 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.7239 - loss: 0.5547 - val_accuracy: 0.7211 - val_loss: 0.5539
232/232 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.7153 - loss: 0.5618
Accuracy: 0.7211123108863831

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# define the model
model = Sequential()
model.add(Dense(64, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr], verbose=10)

# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

Epoch 1/50

/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
232/232 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.7221 - loss: 0.5680
Accuracy: 0.7312365174293518

# plot that shows the improvement of the accuracies from stage 1/2 to stage 3
algorithms = ['RandomForest', 'XGBoost', 'CatBoost']
version_1_accuracies = [accuracy_best_rf, accuracy_xgb, accuracy_catboost]
version_2_accuracies = [new_accuracy_rf, accuracy_f1_xgb, accuracy_best_cb]

# create a bar plot
x = np.arange(len(algorithms))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 4))

# plotting the bars for version 1 and version 2
rects1 = ax.bar(x - width/2, version_1_accuracies, width, label='Version 1', color= '#DC143C')
rects2 = ax.bar(x + width/2, version_2_accuracies, width, label='Version 2', color= '#DAA520')

ax.set_xlabel('Algorithms')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy comparison between stage 1/2 and stage 3')
ax.set_xticks(x)
ax.set_xticklabels(algorithms)
ax.legend()

for rect in rects1:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., height, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

for rect in rects2:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., height, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

plt.show()

# summary of all the accuracies calculated in stage 3
models = ['RF', 'RF + Threshold Moving', 'RF + Gradient Boosting ', 'XGB', 'XGB + Grid Search', 'XGB + Threshold', 'CB + Grid Search',  'CB + Threshold',  'CB + RF + Threshold', 'Stacked Model']
scores = [new_accuracy_rf, accuracy_rf_threshold, accuracy_rf_gb_stacked, new_accuracy_best_xgb, accuracy_f1_xgb, accuracy_xgb_threshold, accuracy_best_cb, accuracy_cb_threshold, accuracy_cb_rf_threshold, accuracy_stacked]

plt.figure(figsize=(20, 5))
bars = plt.bar(models, scores, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'])

# add values on top of the bars
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{score:.2f}',
             ha='center', va='bottom', fontsize=12)

plt.xlabel('Models')
plt.ylabel('Performance (Accuracy)')
plt.title('Comparaison of machine learning model performance')
plt.ylim(0, 1)

plt.show()

updates = ['RF', 'RF + Threshold Moving', 'RF + Gradient Boosting ', 'XGB', 'XGB + Grid Search', 'XGB + Threshold', 'CB + Grid Search',  'CB + Threshold',  'CB + RF + Threshold', 'Stacked Model']
accuracies = [new_accuracy_rf, accuracy_rf_threshold, accuracy_rf_gb_stacked, new_accuracy_best_xgb, accuracy_f1_xgb, accuracy_xgb_threshold, accuracy_best_cb, accuracy_cb_threshold, accuracy_cb_rf_threshold, accuracy_stacked]
precision_values = [0.75, 0.66, 0.62, 0.69, 0.74, 0.66, 0.73, 0.66, 0.66, 0.73]
recall_values = [0.46, 0.61, 0.51, 0.61, 0.53, 0.65, 0.53, 0.66, 0.67, 0.55]
f1_values = [0.56, 0.63, 0.56, 0.64, 0.62, 0.66, 0.61, 0.66, 0.66, 0.63]

# plot the accuracies over the update methods
plt.figure(figsize=(12, 6))

plt.plot(updates, accuracies, label='Accuracy', marker='o', linestyle='-', color='green')
plt.plot(updates, precision_values, label='Precision (Class 1)', marker='o', linestyle='-', color='blue')
plt.plot(updates, recall_values, label='Recall (Class 1)', marker='o', linestyle='-', color='red')
plt.plot(updates, f1_values, label='F1-score (Class 1)', marker='o', linestyle='-', color='purple')

plt.xlabel('Update Methods')
plt.ylabel('Metric Values')
plt.title('Evolution of accuracy and metrics for class 1')

plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

	Latitude	Longitude	Depth	Mag	rms	depthError
count	37331.000000	37331.000000	37331.000000	37331.000000	37331.000000	37331.000000
mean	5.457651	38.877695	58.475362	5.948616	1.000422	8.654680
std	30.789822	123.090934	109.381376	0.455160	0.262592	8.280941
min	-77.080000	-179.997000	-4.000000	5.500000	0.005000	0.000000
25%	-16.519800	-75.807000	15.000000	5.600000	1.000000	5.200000
50%	1.153000	98.577000	28.500000	5.800000	1.000000	6.100000
75%	33.786000	143.347850	41.000000	6.140000	1.000000	7.300000
max	87.199000	180.000000	700.000000	9.500000	42.410000	569.200000

	Latitude	Longitude	Depth	Mag	rms	depthError	Year
count	37331.000000	37331.000000	37331.000000	37331.000000	37331.000000	37331.000000	37331.000000
mean	5.457651	38.877695	58.475362	5.948616	1.000422	8.654680	1979.748118
std	30.789822	123.090934	109.381376	0.455160	0.262592	8.280941	29.096380
min	-77.080000	-179.997000	-4.000000	5.500000	0.005000	0.000000	1900.000000
25%	-16.519800	-75.807000	15.000000	5.600000	1.000000	5.200000	1961.000000
50%	1.153000	98.577000	28.500000	5.800000	1.000000	6.100000	1985.000000
75%	33.786000	143.347850	41.000000	6.140000	1.000000	7.300000	2004.000000
max	87.199000	180.000000	700.000000	9.500000	42.410000	569.200000	2023.000000

	Depth	Mag	magError
count	37197.000000	37331.000000	16551.000000
mean	58.583346	5.948616	0.261882
std	109.563400	0.455160	0.169566
min	-4.000000	5.500000	0.000000
25%	15.000000	5.600000	0.200000
50%	28.500000	5.800000	0.200000
75%	41.000000	6.140000	0.330000
max	700.000000	9.500000	1.840000

	Depth	Mag	magError
count	37189.000000	37189.000000	16549.000000
mean	58.596290	5.948595	0.261911
std	109.571627	0.454672	0.169556
min	0.000000	5.500000	0.000000
25%	15.000000	5.600000	0.200000
50%	28.500000	5.800000	0.200000
75%	41.000000	6.140000	0.330000
max	700.000000	9.500000	1.840000

	Time	Place	Latitude	Longitude	Depth	Mag	MagType	nst	gap	dmin	...	magSource	Hour	Day	Month	Season	RegionCluster	nstxgap	nstxdmin	totalError
36755	19141026	0	45.080	7.404	15.0	5.50	0	242.0	36.0	2.497	...	0	3	26	10	3	0	8712.0	604.274	15.17
15707	19910506	1	10.353	125.253	32.1	5.50	1	242.0	36.0	2.497	...	1	0	6	5	1	2	8712.0	604.274	13.40
36127	19210420	2	35.200	32.997	15.0	5.51	0	242.0	36.0	2.497	...	0	16	20	4	1	0	8712.0	604.274	14.50
13660	19950629	3	-19.544	169.287	139.4	6.60	0	242.0	36.0	2.497	...	2	12	29	6	2	2	8712.0	604.274	13.40
16152	19900614	4	11.760	121.899	18.1	7.10	0	242.0	36.0	2.497	...	2	7	14	6	2	2	8712.0	604.274	13.40

	Time	Place	Latitude	Longitude	Depth	Mag	MagType	nst	gap	dmin	...	Updated	Unnamed: 14	Type	horizontalError	depthError	magError	magNst	status	locationSource	magSource
0	2023-02-17T09:37:34.868Z	130 km SW of Tual, Indonesia	-6.5986	132.0763	38.615	6.1	mww	119.0	51.0	2.988	...	2023-02-17T17:58:24.040Z	NaN	earthquake	6.41	5.595	0.065	23.0	reviewed	us	us
1	2023-02-16T05:37:05.138Z	7 km SW of Port-Olry, Vanuatu	-15.0912	167.0294	36.029	5.6	mww	81.0	26.0	0.392	...	2023-02-17T05:41:32.448Z	NaN	earthquake	5.99	6.080	0.073	18.0	reviewed	us	us
2	2023-02-15T18:10:10.060Z	Masbate region, Philippines	12.3238	123.8662	20.088	6.1	mww	148.0	47.0	5.487	...	2023-02-16T20:12:32.595Z	NaN	earthquake	8.61	4.399	0.037	71.0	reviewed	us	us
3	2023-02-15T06:38:09.034Z	54 km WNW of Otaki, New Zealand	-40.5465	174.5709	74.320	5.7	mww	81.0	40.0	0.768	...	2023-02-16T06:42:09.738Z	NaN	earthquake	3.68	4.922	0.065	23.0	reviewed	us	us
4	2023-02-14T13:16:51.072Z	2 km NW of Lele?ti, Romania	45.1126	23.1781	10.000	5.6	mww	132.0	28.0	1.197	...	2023-02-17T09:15:18.586Z	NaN	earthquake	4.85	1.794	0.032	95.0	reviewed	us	us

	0
Time	object
Place	object
Latitude	float64
Longitude	float64
Depth	float64
Mag	float64
MagType	object
nst	float64
gap	float64
dmin	float64
rms	float64
net	object
ID	object
Updated	object
Unnamed: 14	float64
Type	object
horizontalError	float64
depthError	float64
magError	float64
magNst	float64
status	object
locationSource	object
magSource	object

	Time	Place	MagType	net	ID	Updated	Type	status	locationSource	magSource
count	37331	37331	37331	37331	37331	37331	37331	37331	37331	37331
unique	37331	25801	18	16	37331	36991	4	2	54	54
top	2023-02-17T09:37:34.868Z	South Sandwich Islands region	mw	us	us6000jpl7	2018-06-04T20:43:44.000Z	earthquake	reviewed	us	us
freq	1	664	18700	23364	1	143	37080	37317	23069	13264

	Time	Place	Latitude	Longitude	Depth	Mag	rms	depthError
0	20230217	0	-6.5986	132.0763	38.615	6.1	0.76	5.595
1	20230216	1	-15.0912	167.0294	36.029	5.6	0.94	6.080
2	20230215	2	12.3238	123.8662	20.088	6.1	0.54	4.399
3	20230215	3	-40.5465	174.5709	74.320	5.7	1.15	4.922
4	20230214	4	45.1126	23.1781	10.000	5.6	0.40	1.794

	0
Time	int64
Place	object
Latitude	float64
Longitude	float64
Depth	float64
Mag	float64
MagType	object
rms	float64
net	object
Type	object
depthError	float64
status	object
locationSource	object
magSource	object
Year	int32

	Time	Place	Latitude	Longitude	Depth	Mag	MagType	nst	gap	dmin	...	magError	magNst	status	locationSource	magSource	Hour	Day	Month
90	2022-12-05 18:16:12.113000+00:00	NaN	-26.6668	-114.0671	10.00	5.5	mww	215.0	57.0	4.251	...	0.103	9.0	reviewed	us	us	18	5	12
87	2022-12-06 09:34:51.134000+00:00	NaN	-40.8109	78.5642	10.00	5.5	mww	33.0	60.0	3.108	...	0.071	19.0	reviewed	us	us	9	6	12
73	2022-12-15 04:03:15.817000+00:00	NaN	23.7695	121.8115	13.00	5.9	mww	129.0	36.0	0.449	...	0.043	52.0	reviewed	us	us	4	15	12
21	2023-02-06 01:26:50.760000+00:00	NaN	37.2241	36.9749	10.00	5.7	mb	97.0	75.0	0.194	...	0.072	69.0	reviewed	us	us	1	6	2
7	2023-02-11 08:55:07.845000+00:00	NaN	3.6213	126.6944	48.78	5.9	mww	130.0	37.0	2.909	...	0.054	33.0	reviewed	us	us	8	11	2

Earthquake Prediction¶

Phase 1: data Collection and Preparation¶

Step 1: Retrieve data from the Significant Earthquake dataset (1900-2023)¶

Step 2: Visualization¶

Step 3: Analyze Data Quality (Missing Values, Duplicates, Statistical Information, etc.)¶

Step 4: Convert categorical data into numeric data¶

Step 5: Define Key Variables for Analysis and Address Data Imbalance Issues¶

Step 6: Perform Correlation Analysis and Dimensionality Reduction¶

Step 7: Split the data¶

Step 8: Model Implementation and Evaluation¶

Linear Regression¶

Linear Regression (Classification)¶

Logistic Regression (Classification)¶

Random Forest Classifier¶

Decision Tree Classifier¶

K-Nearest Neighbors (KNN)¶

Support Vector Machine (SVM)¶

Phase 2: Improving the standard solution¶

Step 1: Hyperparameter Tuning¶

Step 2: Feature Engineering¶

Step 3: Handle Class Imbalance¶

Step 4: Advanced Algorithms¶

XGBoost Classifier¶

LightGBM Regression¶

CatBoost Classifier¶

Deep Neural Networks (DNN)¶

Gradient Boosting Classifier¶

Phase 3: Improving more and more¶

Step 1: Data preprocessing¶

Handle outliers¶

Handle missing values¶

Create new features¶

Convert categorical columns¶

Regrouping all the modifications¶

Step 2: Define key variables¶

Step 3: Determine the best classification models¶

Step 4: Model Improvement¶

Random Forest Classifier¶

RF + Threshold Moving¶

RF + Gradient Boosting¶

XGBoost Classifier¶

XGBoost + Grid Search (recall and f1-score)¶

XGBoost + Threshold Moving¶

CatBoost Classifier¶

CB + Grid Search¶

CB + Threshold Moving¶

CB + RF + Threshold Moving¶

CB + RF + XGB + GB¶

Clarification on the Kaggle notebook mentionned in the pre-project¶

Step 5: Deep learning algorithm (Neural Network built using TensorFlow and Keras)¶

Step 6: Visualize results¶

Step 7: Conclusion¶