Banner image placeholder
Banner image

Can We Create a Model to Predict Canadian Rent Prices for 2025 Based on Rent and Immigration Data? (Capstone Project)


Full Screen
Exit Full Screen
# Python code used for this research #

##############################################################
## SECTION 1: Import, Clean-Up, and Preparation of the Data ##
##############################################################

## Import data ##

# Colab-only helper: prompts the user to pick local files and uploads them
# into the notebook's working directory.
from google.colab import files
uploaded = files.upload()
# Steps (manual):
# 1. Download the two input files locally:
#    'synthetic_rental_prices_canada_updated.csv' and
#    'PR_Admissions_unpivoted_added_2025.xlsx'
# 2. Upload both files into this Colab notebook when prompted above.

# Read the CSV file with monthly rents by city.
import pandas as pd
csv_file = 'synthetic_rental_prices_canada_updated.csv'
df_rental = pd.read_csv(csv_file)
df_rental.tail()

# Read the Excel file with permanent-resident (PR) admissions by city/date.
# BUG FIX: the dtype dict key was misspelled 'Provice', so the 'Province'
# column was never covered by the requested cast.
xlsx_file = 'PR_Admissions_unpivoted_added_2025.xlsx'
df_pr = pd.read_excel(
    xlsx_file,
    dtype={'Province': str, 'City': str, 'Date': str,
           'Admissions': int, 'Month': str, 'Year': int},
)
df_pr.tail()

# The 'Date' column has missing values but is redundant (Month/Year carry
# the same information), so it will be dropped downstream.
df_pr.info()

## Check whether the province/city values match between the two dataframes ##

# Distinct non-null city names from each source.
unique_cities_df_rental = set(df_rental['City'].dropna().unique())
unique_cities_df_pr = set(df_pr['City'].dropna().unique())

# Set algebra: the overlap plus the names exclusive to each side.
matching_cities = unique_cities_df_rental & unique_cities_df_pr
only_in_df_rental = unique_cities_df_rental - unique_cities_df_pr
only_in_df_pr = unique_cities_df_pr - unique_cities_df_rental

print("Matching Cities:", matching_cities)
print("Cities only in df_rental:", only_in_df_rental)
print("Cities only in df_pr:", only_in_df_pr)

# Findings: df_rental has {'Montreal', 'Ottawa'} while df_pr has 'Montréal',
# 'Ottawa - Gatineau (Ontario part)' and 'Ottawa - Gatineau (Quebec part)'.
# Next steps in df_pr: rename 'Montréal' -> 'Montreal', fold both
# Ottawa-Gatineau parts into 'Ottawa', then drop every city not among the
# 7 cities present in df_rental.


# Harmonize the PR city names with those used in df_rental:
#  - 'Montréal' -> 'Montreal'
#  - both Ottawa-Gatineau parts -> 'Ottawa' (their rows are summed below)
df_pr['City'] = df_pr['City'].replace({
    'Montréal': 'Montreal',
    'Ottawa - Gatineau (Ontario part)': 'Ottawa',
    'Ottawa - Gatineau (Quebec part)': 'Ottawa'
})

# The seven cities present in the rental dataset.
target_cities = ['Vancouver', 'Edmonton', 'Toronto', 'Calgary', 'Halifax',
                 'Montreal', 'Ottawa']

# Keep only rows for the target cities.
# FIX: use the documented boolean drop=True instead of drop=1.
df_pr_clean = df_pr[df_pr['City'].isin(target_cities)].reset_index(drop=True)
df_pr_clean.City.unique()

df_pr_clean.tail()

# Sum admissions per (Province, City, Month, Year); this merges the two
# Ottawa-Gatineau rows renamed to 'Ottawa' above into one record.
df_pr_agg = df_pr_clean.groupby(['Province', 'City', 'Month', 'Year'],
                                as_index=False)['Admissions'].sum()
df_pr_agg.tail()

# Convert the month abbreviations ('Jan'...'Dec') into month numbers 1-12.
_month_abbrs = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_mapping = {abbr: f'{num:02d}'
                 for num, abbr in enumerate(_month_abbrs, start=1)}

# Map to zero-padded strings, then parse to integers in one pass.
df_pr_agg['Month'] = pd.to_numeric(df_pr_agg['Month'].map(month_mapping))
df_pr_agg.tail()

## Data mining: explore which columns influence the rent ##

# Work on a copy so the raw rental frame stays untouched.
df_explore = df_rental.copy()
df_rental.dtypes

# Strip blanks from the column names (e.g. 'Monthly Rent' -> 'MonthlyRent').
df_explore.columns = df_explore.columns.str.replace(' ', '', regex=False)
df_explore.dtypes

# Variable groups used throughout the exploratory analysis below.
dep_var = ['MonthlyRent']

categorical_vars = ['City', 'RentalType', 'Furnished', 'PetFriendly',
                    'ParkingIncluded', 'InternetAvailability']

discrete_vars = ['Year', 'Month', 'Bedrooms', 'SquareFootage', 'WalkScore',
                 'TransitScore', 'BuildingAge', 'LeaseTerm']

continuous_vars = ['DistancetoCityCenter(km)', 'EnergyEfficiencyRating',
                   'NoiseLevel', 'NearbySchoolsRating', 'CrimeRateIndex',
                   'AnnualPropertyTax']

# Summary statistics for the target plus all numeric predictors.
df_explore.loc[:, dep_var + discrete_vars + continuous_vars].describe()

import seaborn as sns

# Distribution of the target variable.
sns.displot(df_explore['MonthlyRent'])

import matplotlib.pyplot as plt

# Histograms of every numeric predictor.
df_explore.loc[:, discrete_vars + continuous_vars].hist(bins=50, figsize=(20, 15))
plt.show()

# Pearson correlations of the continuous predictors with MonthlyRent.
corr_matrix = df_explore.loc[:, ['MonthlyRent'] + continuous_vars].corr(method='pearson')
corr_matrix['MonthlyRent'].sort_values(ascending=False)

# Pearson correlations of the discrete predictors with MonthlyRent.
corr_matrix = df_explore.loc[:, ['MonthlyRent'] + discrete_vars].corr(method='pearson')
corr_matrix['MonthlyRent'].sort_values(ascending=False)

# Pairwise scatter matrices against the target.
sns.pairplot(data=df_explore.loc[:, ['MonthlyRent'] + continuous_vars])
plt.show()

sns.pairplot(data=df_explore.loc[:, ['MonthlyRent'] + discrete_vars])
plt.show()

# Boxplots of rent by each categorical variable; rotate tick labels when a
# variable has many categories so they stay readable.
for item in categorical_vars:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=item, y='MonthlyRent', data=df_explore)
    if len(df_explore[item].unique()) > 4:
        plt.xticks(rotation=45)
    plt.show()

# Keep only the columns that showed a useful relationship with the rent.
df_rent_clean = df_explore[['City', 'RentalType', 'Year', 'Month', 'Bedrooms',
                            'SquareFootage', 'AnnualPropertyTax', 'MonthlyRent']]
df_rent_clean.tail()

## Join the rent and PR dataframes ##

# Left-join keeps every rental record; 'Admissions' is NaN wherever there is
# no PR data for that (City, Year, Month).
df_combine = pd.merge(df_rent_clean, df_pr_agg, on=['City', 'Year', 'Month'], how='left')
df_combine.drop(['Province'], inplace=True, axis=1)  # Province is redundant after the join
df_combine

# NOTE: earlier iterations converted a comma-formatted 'Admissions' string to
# float here; the dead commented-out code was removed because the column now
# arrives as numeric from the aggregation step.

## Prepare 2024/2025 feature rows for prediction ##

df_rent_clean['Year'].unique()

# Re-use the 2023 rental listings as the feature rows for the 2024 prediction
# (the synthetic rent data ends in 2023), paired with 2024 PR admissions.
# FIX: filter first, then .copy() — assigning a new column to a .loc slice of
# a copy triggered pandas' SettingWithCopyWarning in the original.
df_rent_clean_2024 = df_rent_clean.loc[df_rent_clean['Year'] == 2023, :].copy()
df_rent_clean_2024['AdmissionsYear'] = 2024

# Same 2023 listings, paired with 2025 PR admissions.
df_rent_clean_2025 = df_rent_clean.loc[df_rent_clean['Year'] == 2023, :].copy()
df_rent_clean_2025['AdmissionsYear'] = 2025
df_rent_clean_2025.head()

# Rename df_pr_agg['Year'] to 'AdmissionsYear' so the merge keys line up.
df_pr_agg2025 = df_pr_agg.rename(columns={'Year': 'AdmissionsYear'})
df_pr_agg2025.tail()

# Join the 2023 listings with 2024 PR admissions to predict 2024 rents.
df_combine_2024 = df_rent_clean_2024.merge(df_pr_agg2025, on=['City', 'AdmissionsYear', 'Month'], how='left')
df_combine_2024.drop(['Province'], inplace=True, axis=1)  # redundant after the join
df_combine_2024 = df_combine_2024[df_combine_2024['Admissions'].notna()]

# Join the 2023 listings with 2025 PR admissions to predict 2025 rents.
df_combine_2025 = df_rent_clean_2025.merge(df_pr_agg2025, on=['City', 'AdmissionsYear', 'Month'], how='left')
df_combine_2025.drop(['Province'], inplace=True, axis=1)
# PR admissions data is only available for part of the year; drop the
# un-matched months.
df_combine_2025 = df_combine_2025[df_combine_2025['Admissions'].notna()]
df_combine_2025

# Stack the two prediction frames into one.
df_combine_2425 = pd.concat([df_combine_2024, df_combine_2025])
df_combine_2425.head()

###############################
## SECTION 2: Pre-Processing ##
###############################

## Pre-processing pipeline: one-hot encode categoricals, scale numericals ##

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

dep_var = ['MonthlyRent']

categorical_vars_comb = ['City', 'RentalType']

discrete_vars_comb = ['Year', 'Month', 'Bedrooms', 'SquareFootage', 'Admissions']

continuous_vars_comb = ['AnnualPropertyTax']

# Categorical columns -> dense one-hot vectors; unseen categories at
# transform time are ignored rather than raising.
cat_pipe = Pipeline([('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

# Numeric columns -> standardized to zero mean / unit variance.
num_pipe = Pipeline([('scl', StandardScaler())])

ct = ColumnTransformer(transformers=[
    ('cat', cat_pipe, categorical_vars_comb),
    ('num', num_pipe, discrete_vars_comb + continuous_vars_comb),
])

# Fit the transformer on the combined frame and build the model matrix.
feature_cols = categorical_vars_comb + discrete_vars_comb + continuous_vars_comb
ct.fit(df_combine[feature_cols])
X = ct.transform(df_combine[feature_cols])
y = df_combine[['MonthlyRent']].values

X

## Train-test split ##

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

################################
## SECTION 3: Model Selection ##
################################

## Regression models ##

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Target variable and the raw feature columns for the baseline model.
y = df_combine['MonthlyRent']
X = df_combine[['City', 'RentalType', 'Year', 'Month', 'Bedrooms',
                'SquareFootage', 'AnnualPropertyTax', 'Admissions']]

# One-hot encode 'City' and 'RentalType'; drop_first avoids the dummy trap.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Fit an ordinary least-squares baseline.
model = LinearRegression()
model.fit(X_train, y_train)

print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Score the held-out rows.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
# BUG FIX: RandomForestRegressor was used below before being imported (its
# import only appeared later in the notebook), raising a NameError when the
# cells run top-to-bottom.
from sklearn.ensemble import RandomForestRegressor

# X is already fully numeric after the earlier get_dummies call; get_dummies
# only touches object/category columns, so this repeat is a harmless no-op.
X = pd.get_dummies(X, drop_first=True)

# Random forest with a fixed seed for reproducibility.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# cross_val_score maximizes its metric, so MSE must be negated.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validation on the full feature matrix.
scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)

# Flip the sign back so the reported errors are positive.
mse_scores = -scores

print("Random Forest Regressor Cross-Validation Results:")
print("Mean Squared Errors for each fold:", mse_scores)
print("Average MSE:", mse_scores.mean())



# X was already one-hot encoded above; get_dummies leaves an all-numeric
# frame unchanged, so this call is a harmless repeat.
X = pd.get_dummies(X, drop_first=True)

# Cross-validate plain linear regression the same way for comparison.
model = LinearRegression()

# Negated-MSE scorer (cross_val_score maximizes its metric).
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validation.
scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)

# Undo the negation for readability.
mse_scores = -scores

print("Linear Regression Cross-Validation Results:")
print("Mean Squared Errors for each fold:", mse_scores)
print("Average MSE:", mse_scores.mean())

# Hyperparameter tuning for the Random Forest Regressor
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Rebuild the combined frame from the cleaned inputs.
df_combine = pd.merge(df_rent_clean, df_pr_agg, on=['City', 'Year', 'Month'], how='left')
# BUG FIX: the original dropped ['Date', 'Province'], but neither input frame
# carries a 'Date' column after cleaning, so drop() raised a KeyError.  Only
# 'Province' actually needs removing.
df_combine.drop(['Province'], inplace=True, axis=1)

# Normalize categorical columns to plain strings so the encoder sees a
# single consistent dtype.
for col in df_combine.select_dtypes(include=['object', 'category']).columns:
    df_combine[col] = df_combine[col].astype(str)

# The column the model predicts.
target_variable = 'MonthlyRent'

# Split columns by dtype for the ColumnTransformer defined below.
categorical_columns = df_combine.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df_combine.select_dtypes(include=['int64', 'float64']).columns.tolist()

# The target must not appear among the features.
numerical_columns = [col for col in numerical_columns if col != target_variable]

# Separate features and target.
X = df_combine.drop(columns=[target_variable])
y = df_combine[target_variable]

# One-hot encode the categorical columns; numeric columns pass through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough',
)

# Random forest with a fixed seed, chained after the preprocessing step.
model = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Search grid over the forest's main capacity/regularization knobs.
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# 80/20 split, then an exhaustive 3-fold grid search on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3,
                           scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Negative Mean Squared Error:", grid_search.best_score_)

# Evaluate the refitted best pipeline on the held-out set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Set Score (R^2):", test_score)

## Testing the best model

y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

# Two-panel diagnostic: residuals vs. predictions (top) and the residual
# distribution (bottom).
fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(hspace=.5)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.scatter(y_pred, residuals, alpha=0.6)
ax1.axhline(y=0, color='r', linestyle='--')  # zero line = perfect prediction
ax1.set_xlabel('Predicted Values')
ax1.set_ylabel('Residuals')
ax1.set_title('Residual Plot')

ax2.hist(residuals, bins=35)
# BUG FIX: the histogram's x-axis holds residuals, not predictions, so the
# original 'Predicted Values' label was wrong.
ax2.set_xlabel('Residuals')
ax2.set_title('Residuals Histogram')

fig.show()


## Checking for overfitting

# Compare R^2 on train vs. test: a large gap would indicate overfitting.
train_score = best_model.score(X_train, y_train)

print("Training Set Score (R^2):", train_score)
print("Test Set Score (R^2):", test_score)

## Checking feature importances
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Recover the fitted preprocessor and its one-hot encoder so the expanded
# feature names can be reconstructed.
fitted_pre = best_model.named_steps['preprocessor']
encoder = fitted_pre.named_transformers_['cat']

# Names of the one-hot-encoded dummy columns.
encoded_names = encoder.get_feature_names_out(categorical_columns)

# ColumnTransformer emits the encoded categoricals first, then the
# passthrough numeric columns.
all_feature_names = list(encoded_names) + numerical_columns

# Importance scores from the fitted random forest.
importances = best_model.named_steps['model'].feature_importances_

import numpy as np
import matplotlib.pyplot as plt

# Bar chart of importances, largest first; the x-limit focuses on the top ten.
order = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[order], align='center')
plt.xlim((0, 10))
plt.xticks(range(len(importances)), np.array(all_feature_names)[order], rotation=90)
plt.title('Feature Importance')
plt.show()

## Predicting prices for 2024 and 2025 ##

# Feature rows for the prediction years (target column removed).
X_2425 = df_combine_2425.drop(columns=['MonthlyRent'])
predict_2425 = best_model.predict(X_2425)

# Attach the predictions, then average them per city/year/month.
df_predict = X_2425.copy()
df_predict['MonthlyRent'] = predict_2425
df_predict.head()
df_predict_group = (
    df_predict
    .groupby(by=['City', 'AdmissionsYear', 'Month'], as_index=False)
    .agg({'MonthlyRent': 'mean'})
    .rename(columns={'AdmissionsYear': 'Year'})
)
df_predict_group['Source'] = 'Prediction'
df_predict_group.head()

## Average the historic rents per city/year/month for comparison
df_combine_group = df_combine.groupby(by=['City', 'Year', 'Month'], as_index=False).agg({'MonthlyRent': 'mean'})
df_combine_group['Source'] = 'Historic'
df_combine_group.head()

## Stack historic and predicted rents into one chronological timeline
combined = pd.concat([df_predict_group, df_combine_group])
combined['Year-Month'] = pd.to_datetime(
    combined['Year'].astype(str) + "-" + combined['Month'].astype(str),
    yearfirst=True,
)
combined.sort_values(by='Year-Month', ascending=True, inplace=True)
combined.reset_index(drop=True, inplace=True)
combined.tail()

## Build the matching Year-Month axis for the admissions data
df_pr['Year-Month'] = pd.to_datetime(
    df_pr['Year'].astype(str) + "-" + df_pr['Month'].astype(str),
    yearfirst=True,
)
df_pr.sort_values(by='Year-Month', ascending=True, inplace=True)
df_pr.reset_index(drop=True, inplace=True)
df_pr.head()

## Plot PR admissions and rent (historic + predicted) for the six most
## populous Canadian cities

for city in ['Toronto', 'Montreal', 'Vancouver', 'Calgary', 'Edmonton', 'Ottawa']:
    fig = plt.figure(figsize=(15, 5))
    fig.suptitle(city)
    ax1 = fig.add_subplot(121)  # left: monthly PR admissions since 2019
    ax2 = fig.add_subplot(122)  # right: monthly rent over the full timeline

    pr_plot = df_pr.loc[(df_pr['City'] == city) & (df_pr['Year'] >= 2019), ['Year-Month', 'Admissions']]
    pr_plot.plot(x='Year-Month', y='Admissions', ax=ax1)

    combined_plot = combined.loc[(combined['City'] == city), ['Year-Month', 'MonthlyRent']]
    combined_plot.plot(x='Year-Month', y='MonthlyRent', ax=ax2)

    # BUG FIX: the x-axis holds datetimes, so the shaded-span start must be a
    # Timestamp — the raw string '2024-01' fails matplotlib's date-unit
    # conversion on this axis.
    ax2.axvspan(pd.Timestamp('2024-01-01'), combined_plot['Year-Month'].max(),
                color='lightgray', alpha=0.5, label='After 2024')

Translate to