# Python code used for this research #
##############################################################
## SECTION 1: Import, Clean-Up, and Preparation of the Data ##
##############################################################
## Import data ##
# Steps:
#   1. Download the two source files locally:
#      'synthetic_rental_prices_canada_updated.csv' and
#      'PR_Admissions_unpivoted_added_2025.xlsx'
#   2. Upload them into this Colab notebook's file area.
from google.colab import files
uploaded = files.upload()

import pandas as pd

# Rental prices by city (CSV).
csv_file = 'synthetic_rental_prices_canada_updated.csv'
df_rental = pd.read_csv(csv_file)
df_rental.tail()

# Permanent-resident (PR) admissions by city and date (Excel).
# FIX: the original dtype mapping used the misspelled key 'Provice', so the
# 'Province' column (grouped on later) was silently never cast to str.
xlsx_file = 'PR_Admissions_unpivoted_added_2025.xlsx'
df_pr = pd.read_excel(
    xlsx_file,
    dtype={'Province': str, 'City': str, 'Date': str,
           'Admissions': int, 'Month': str, 'Year': int},
)
df_pr.tail()

# The 'Date' column has missing values but is redundant (Month/Year carry the
# same information), so it is dropped later in the pipeline.
df_pr.info()
## Check whether the values of province or city are matching between two dataframes ##
# Gather the distinct (non-null) city names from each dataframe.
unique_cities_df_rental = set(df_rental['City'].dropna().unique())
unique_cities_df_pr = set(df_pr['City'].dropna().unique())

# Set algebra: overlap, plus the cities unique to each side.
matching_cities = unique_cities_df_rental & unique_cities_df_pr
only_in_df_rental = unique_cities_df_rental.difference(unique_cities_df_pr)
only_in_df_pr = unique_cities_df_pr.difference(unique_cities_df_rental)

for label, cities in (("Matching Cities:", matching_cities),
                      ("Cities only in df_rental:", only_in_df_rental),
                      ("Cities only in df_pr:", only_in_df_pr)):
    print(label, cities)

# Findings: df_rental has {'Montreal', 'Ottawa'} while df_pr has
# {'Montréal', 'Ottawa - Gatineau (Ontario part)', 'Ottawa - Gatineau (Quebec part)'}.
# Next steps (applied below): rename 'Montréal' -> 'Montreal', merge both
# Ottawa - Gatineau parts into 'Ottawa', then drop all cities not among the
# seven present in df_rental.
# Harmonize df_pr city names with df_rental:
#   * 'Montréal' -> 'Montreal'
#   * both Ottawa - Gatineau parts -> 'Ottawa' (their admissions are summed
#     in the aggregation below)
df_pr['City'] = df_pr['City'].replace({
    'Montréal': 'Montreal',
    'Ottawa - Gatineau (Ontario part)': 'Ottawa',
    'Ottawa - Gatineau (Quebec part)': 'Ottawa'
})

# Keep only the seven cities that exist in df_rental.
target_cities = ['Vancouver', 'Edmonton', 'Toronto', 'Calgary', 'Halifax',
                 'Montreal', 'Ottawa']
# FIX: reset_index's drop parameter is boolean; the original drop=1 only
# worked because 1 is truthy.
df_pr_clean = df_pr[df_pr['City'].isin(target_cities)].reset_index(drop=True)
df_pr_clean.City.unique()
df_pr_clean.tail()

# Sum admissions per Province/City/Month/Year; this also merges the two
# Ottawa - Gatineau rows that were renamed to 'Ottawa'.
df_pr_agg = df_pr_clean.groupby(['Province', 'City', 'Month', 'Year'],
                                as_index=False)['Admissions'].sum()
df_pr_agg.tail()
# Convert three-letter month abbreviations in df_pr_agg to month numbers
# (1-12); unrecognized abbreviations would map to NaN.
month_mapping = {abbr: number for number, abbr in enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], start=1)}
df_pr_agg['Month'] = df_pr_agg['Month'].map(month_mapping)
df_pr_agg.tail()
## Data Mining: Explore the impact columns to the rent ##
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy so df_rental itself stays untouched.
df_explore = df_rental.copy()
df_rental.dtypes
# Strip spaces out of the column names (e.g. 'Monthly Rent' -> 'MonthlyRent').
df_explore.columns = df_explore.columns.str.replace(' ', '', regex=False)
df_explore.dtypes

# Variable groups used throughout the exploration.
dep_var = ['MonthlyRent']
categorical_vars = ['City', 'RentalType', 'Furnished', 'PetFriendly',
                    'ParkingIncluded', 'InternetAvailability']
discrete_vars = ['Year', 'Month', 'Bedrooms', 'SquareFootage', 'WalkScore',
                 'TransitScore', 'BuildingAge', 'LeaseTerm']
continuous_vars = ['DistancetoCityCenter(km)', 'EnergyEfficiencyRating',
                   'NoiseLevel', 'NearbySchoolsRating', 'CrimeRateIndex',
                   'AnnualPropertyTax']

# Summary statistics and distributions of the numeric variables.
df_explore.loc[:, dep_var + discrete_vars + continuous_vars].describe()
sns.displot(df_explore['MonthlyRent'])
df_explore.loc[:, discrete_vars + continuous_vars].hist(bins=50, figsize=(20, 15))
plt.show()

# Pearson correlation of each numeric group against MonthlyRent.
corr_matrix = df_explore.loc[:, ['MonthlyRent'] + continuous_vars].corr(method='pearson')
corr_matrix['MonthlyRent'].sort_values(ascending=False)
corr_matrix = df_explore.loc[:, ['MonthlyRent'] + discrete_vars].corr(method='pearson')
corr_matrix['MonthlyRent'].sort_values(ascending=False)

# Pairwise scatter plots of the numeric groups against the target.
sns.pairplot(data=df_explore.loc[:, ['MonthlyRent'] + continuous_vars])
plt.show()
sns.pairplot(data=df_explore.loc[:, ['MonthlyRent'] + discrete_vars])
plt.show()

# Rent distribution per category for each categorical variable; rotate the
# tick labels when a variable has many distinct values.
for item in categorical_vars:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=item, y='MonthlyRent', data=df_explore)
    if df_explore[item].unique().size > 4:
        plt.xticks(rotation=45)
    plt.show()

# Keep the columns that looked most informative for modelling.
df_rent_clean = df_explore[['City', 'RentalType', 'Year', 'Month', 'Bedrooms',
                            'SquareFootage', 'AnnualPropertyTax', 'MonthlyRent']]
df_rent_clean.tail()
## Join the rent and pr dataframes ##
# Left-join keeps every rental listing; Admissions is NaN where the PR data
# has no matching City/Year/Month.
df_combine = pd.merge(df_rent_clean, df_pr_agg, on=['City', 'Year', 'Month'], how='left')
df_combine.drop(['Province'], inplace=True, axis=1)  # drop non-important columns from df_pr_agg
df_combine

## Prepare 2025 rental prices data for prediction ##
df_rent_clean['Year'].unique()
# Reuse the 2023 listings as stand-ins for 2024 and 2025, tagging each copy
# with the PR-admissions year it will be matched against.
# FIX: take an explicit .copy() of the filtered rows before adding the
# 'AdmissionsYear' column, to avoid pandas' SettingWithCopyWarning.
df_rent_clean_2024 = df_rent_clean.loc[df_rent_clean['Year'] == 2023, :].copy()
df_rent_clean_2024['AdmissionsYear'] = 2024
df_rent_clean_2025 = df_rent_clean.loc[df_rent_clean['Year'] == 2023, :].copy()
df_rent_clean_2025['AdmissionsYear'] = 2025
df_rent_clean_2025.head()

# Rename df_pr_agg['Year'] to 'AdmissionsYear' so the join keys line up.
df_pr_agg2025 = df_pr_agg.rename(columns={'Year': 'AdmissionsYear'})
df_pr_agg2025.tail()

# Join the cloned 2023 listings with 2024/2025 PR admissions to predict
# Monthly Rent for those years.
df_combine_2024 = df_rent_clean_2024.merge(df_pr_agg2025, on=['City', 'AdmissionsYear', 'Month'], how='left')
df_combine_2024.drop(['Province'], inplace=True, axis=1)  # drop non-important columns from df_pr_agg
df_combine_2024 = df_combine_2024[df_combine_2024['Admissions'].notna()]
df_combine_2025 = df_rent_clean_2025.merge(df_pr_agg2025, on=['City', 'AdmissionsYear', 'Month'], how='left')
df_combine_2025.drop(['Province'], inplace=True, axis=1)  # drop non-important columns from df_pr_agg
# PR admissions are only available through part of the final year (September,
# per the original notes) — drop the unmatched months.
df_combine_2025 = df_combine_2025[df_combine_2025['Admissions'].notna()]
df_combine_2025

# Stack both prediction frames together.
df_combine_2425 = pd.concat([df_combine_2024, df_combine_2025])
df_combine_2425.head()
###############################
## SECTION 2: Pre-Processing ##
###############################
## Create Pre-processing pipeline for categorical encoding and standardized scaling ##
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Column groups for the combined (rent + PR) dataframe.
dep_var = ['MonthlyRent']
categorical_vars_comb = ['City', 'RentalType']
discrete_vars_comb = ['Year', 'Month', 'Bedrooms', 'SquareFootage', 'Admissions']
continuous_vars_comb = ['AnnualPropertyTax']

# Categorical columns: one-hot encode (unseen categories are ignored at
# transform time). Numerical columns: standardize to zero mean / unit variance.
cat_pipe = Pipeline([('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])
num_pipe = Pipeline([('scl', StandardScaler())])
ct = ColumnTransformer(transformers=[
    ('cat', cat_pipe, categorical_vars_comb),
    ('num', num_pipe, discrete_vars_comb + continuous_vars_comb),
])

# Fit the transformer and build the design matrix / target vector.
feature_cols = categorical_vars_comb + discrete_vars_comb + continuous_vars_comb
X = ct.fit_transform(df_combine[feature_cols])
y = df_combine[['MonthlyRent']].values
X

## Train-test split ##
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
################################
## SECTION 3: Model Selection ##
################################
## Regression Models ##
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Target and feature matrix (rebuilt here with pandas get_dummies rather
# than the ColumnTransformer above).
y = df_combine['MonthlyRent']
X = df_combine[['City', 'RentalType', 'Year', 'Month', 'Bedrooms',
                'SquareFootage', 'AnnualPropertyTax', 'Admissions']]
# One-hot encode 'City' and 'RentalType'; drop_first avoids collinear dummies.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
for part_name, part in (('X_train', X_train), ('X_test', X_test),
                        ('y_train', y_train), ('y_test', y_test)):
    print(f"{part_name} shape:", part.shape)

# Fit an ordinary least-squares baseline and inspect its parameters.
model = LinearRegression()
model.fit(X_train, y_train)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Evaluate on the held-out test set.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)
## Cross-validated comparison: Random Forest vs Linear Regression ##
# FIX: RandomForestRegressor was used here before its import (which only
# appeared further below), raising NameError on a top-to-bottom run.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# X is already one-hot encoded above; the original re-ran pd.get_dummies on
# it before each CV (a no-op on an already-encoded frame) — removed.

# Custom scorer: sklearn maximizes scores, so MSE is negated.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validation for the Random Forest Regressor.
model = RandomForestRegressor(n_estimators=100, random_state=0)
scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
mse_scores = -scores  # back to positive MSE for interpretability
print("Random Forest Regressor Cross-Validation Results:")
print("Mean Squared Errors for each fold:", mse_scores)
print("Average MSE:", mse_scores.mean())

# 5-fold cross-validation for plain Linear Regression with the same scorer.
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
mse_scores = -scores
print("Linear Regression Cross-Validation Results:")
print("Mean Squared Errors for each fold:", mse_scores)
print("Average MSE:", mse_scores.mean())
# Hyperparameter Tuning for Random Forest Regressor
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Rebuild the combined dataframe from scratch for the tuning run.
df_combine = pd.merge(df_rent_clean, df_pr_agg, on=['City', 'Year', 'Month'], how='left')
# FIX: only 'Province' exists here — 'Date' never survived the groupby into
# df_pr_agg, so the original drop(['Date', 'Province']) raised KeyError.
df_combine.drop(['Province'], inplace=True, axis=1)

# Ensure categorical columns are plain strings so OneHotEncoder treats them
# uniformly.
for col in df_combine.select_dtypes(include=['object', 'category']).columns:
    df_combine[col] = df_combine[col].astype(str)

# The target column to predict.
target_variable = 'MonthlyRent'

# Split columns into categorical and numerical feature lists, excluding the
# target from the numerical features.
categorical_columns = df_combine.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df_combine.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_columns = [col for col in numerical_columns if col != target_variable]

# Separate features and target.
X = df_combine.drop(columns=[target_variable])
y = df_combine[target_variable]

# One-hot encode categorical features; pass numerical ones through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'  # keep numerical columns untouched
)

# Preprocessing + model in one pipeline so each CV fold is encoded on its
# own training split (no leakage).
model = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Forest hyperparameters to search.
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Hold out 20% for the final test evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3-fold grid search minimizing MSE (sklearn reports it negated).
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3,
                           scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Negative Mean Squared Error:", grid_search.best_score_)

# GridSearchCV refits the best pipeline on the full training set; score it
# on the held-out test set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Set Score (R^2):", test_score)
## Testing the best model
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

# Residual scatter (top) and residual distribution (bottom).
fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(hspace=.5)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.scatter(y_pred, residuals, alpha=0.6)
ax1.axhline(y=0, color='r', linestyle='--')  # zero line: perfect prediction
ax1.set_xlabel('Predicted Values')
ax1.set_ylabel('Residuals')
ax1.set_title('Residual Plot')
ax2.hist(residuals, bins=35)
# FIX: the histogram's x-axis shows residuals, not predicted values — the
# original label was copied from the scatter plot.
ax2.set_xlabel('Residuals')
ax2.set_title('Residuals Histogram')
fig.show()

## Checking for overfitting
# A large train/test R^2 gap would indicate overfitting.
train_score = best_model.score(X_train, y_train)
print("Training Set Score (R^2):", train_score)
print("Test Set Score (R^2):", test_score)
## Checking for feature importances
import numpy as np
import matplotlib.pyplot as plt

# Recover human-readable feature names from the fitted pipeline: one-hot
# encoded categorical names first, then the passthrough numerical columns
# (ColumnTransformer appends remainder columns after the transformers).
fitted_preprocessor = best_model.named_steps['preprocessor']
fitted_ohe = fitted_preprocessor.named_transformers_['cat']
ohe_feature_names = fitted_ohe.get_feature_names_out(categorical_columns)
all_feature_names = list(ohe_feature_names) + numerical_columns

# Importances from the trained forest, plotted in descending order; the
# x-limit restricts the view to the ten most important features.
importances = best_model.named_steps['model'].feature_importances_
order = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[order], align='center')
plt.xlim((0, 10))
plt.xticks(range(len(importances)), np.array(all_feature_names)[order], rotation=90)
plt.title('Feature Importance')
plt.show()
## Predicting Prices for 2024 and 2025 ##
# Score the cloned 2023 listings that carry the 2024/2025 PR admissions.
# NOTE(review): df_combine_2425 includes an 'AdmissionsYear' column the
# pipeline was not fitted on — confirm the fitted preprocessor accepts it.
X_2425 = df_combine_2425.drop(columns=['MonthlyRent'])
predict_2425 = best_model.predict(X_2425)
df_predict = X_2425.copy()
df_predict['MonthlyRent'] = predict_2425
df_predict.head()

# Average predictions per City/Year/Month and tag the rows as predictions.
df_predict_group = (
    df_predict
    .groupby(by=['City', 'AdmissionsYear', 'Month'], as_index=False)
    .agg({'MonthlyRent': 'mean'})
    .rename(columns={'AdmissionsYear': 'Year'})
)
df_predict_group['Source'] = 'Prediction'
df_predict_group.head()

## Combining historic rent data with predicted
df_combine_group = (
    df_combine
    .groupby(by=['City', 'Year', 'Month'], as_index=False)
    .agg({'MonthlyRent': 'mean'})
)
df_combine_group['Source'] = 'Historic'
df_combine_group.head()

# Stack predictions and history, then order chronologically by a synthetic
# Year-Month timestamp.
combined = pd.concat([df_predict_group, df_combine_group])
combined['Year-Month'] = pd.to_datetime(
    combined['Year'].astype(str) + "-" + combined['Month'].astype(str),
    yearfirst=True)
combined.sort_values(by='Year-Month', ascending=True, inplace=True)
combined.reset_index(drop=True, inplace=True)
combined.tail()

## preparing Admissions data for plot
df_pr['Year-Month'] = pd.to_datetime(
    df_pr['Year'].astype(str) + "-" + df_pr['Month'].astype(str),
    yearfirst=True)
df_pr.sort_values(by='Year-Month', ascending=True, inplace=True)
df_pr.reset_index(drop=True, inplace=True)
df_pr.head()
## Plotting data for the most populous cities in Canada
for city in ['Toronto', 'Montreal', 'Vancouver', 'Calgary', 'Edmonton', 'Ottawa']:
    # One figure per city: PR admissions (left) and monthly rent (right).
    fig = plt.figure(figsize=(15, 5))
    fig.suptitle(city)
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    # Admissions from 2019 onward for this city.
    pr_plot = df_pr.loc[(df_pr['City'] == city) & (df_pr['Year'] >= 2019),
                        ['Year-Month', 'Admissions']]
    pr_plot.plot(x='Year-Month', y='Admissions', ax=ax1)

    # Historic plus predicted monthly rent for this city.
    combined_plot = combined.loc[combined['City'] == city,
                                 ['Year-Month', 'MonthlyRent']]
    combined_plot.plot(x='Year-Month', y='MonthlyRent', ax=ax2)
    # Shade the predicted region (2024 onward).
    ax2.axvspan('2024-01', combined_plot['Year-Month'].max(),
                color='lightgray', alpha=0.5, label='After 2024')