Anastasia Giachanou, Tina Shahedi
Machine Learning with Python - Utrecht Summer School
In our first practical, we explored our data and made some transformations, and we used Matplotlib and Seaborn to visualise the data. In this practical, we continue with the California Housing dataset and practice with Linear Regression.
Our goal is to develop a machine learning model to predict house prices from a set of input features. In this practical, you will learn how to prepare the data (encoding, train/test splitting, scaling), fit a linear regression model, apply Lasso and Ridge regularization, and evaluate the models with R-squared and MSE.
Install the necessary libraries (numpy, pandas, scikit-learn, matplotlib, seaborn) with !pip install if they are not yet installed, and then import them at the start of your script.
!pip install -q numpy
!pip install -q pandas
!pip install -q scikit-learn
!pip install -q matplotlib
!pip install -q seaborn
# Importing necessary libraries
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import iqr
We can also import the specific functions required for some of the code that we will use. In this way, we can call these functions directly without prefixing them with the package name.
# Importing necessary functions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.linear_model import LassoCV, RidgeCV
from scipy.stats import probplot
As you can see from the packages that we imported, one of them is scikit-learn (https://scikit-learn.org/stable/). scikit-learn (also known as sklearn) is a free and open-source machine learning library for Python. It provides implementations of various classification, regression and clustering algorithms, including linear regression, support-vector machines, random forests, gradient boosting and k-means, and is designed to interoperate with the Python numerical and scientific libraries NumPy and SciPy.
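To give you an idea of how the library is used, here is a minimal sketch (illustrative only, not part of the exercise) of scikit-learn's common estimator pattern: every model is created via its constructor, fitted on training data with fit(), and then used with predict().
# A minimal illustrative sketch of scikit-learn's estimator pattern (toy data, not part of the exercise)
demo_X = np.array([[1.0], [2.0], [3.0]])       # a tiny toy feature matrix
demo_y = np.array([2.0, 4.0, 6.0])             # toy target values (y = 2x)
demo_model = LinearRegression()                # create the model object
demo_model.fit(demo_X, demo_y)                 # learn the coefficients from the toy data
print(demo_model.predict(np.array([[4.0]])))   # prediction should be close to 8.0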
We begin our model development by loading the 'houses_cleaned' dataset. This is the dataset after we did some preprocessing steps in the first practical.
houses_cleaned = pd.read_csv('houses_cleaned.csv')
houses_cleaned.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | log_median_income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.000 | 129.0 | 322.0 | 126.0 | 8.013025 | 452600.0 | near_bay | 6.984127 | 2.198671 |
| 1 | -122.22 | 37.86 | 21.0 | 5698.375 | 1106.0 | 2401.0 | 1092.5 | 8.013025 | 358500.0 | near_bay | 6.238137 | 2.198671 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.000 | 190.0 | 496.0 | 177.0 | 7.257400 | 352100.0 | near_bay | 8.288136 | 2.111110 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.000 | 235.0 | 558.0 | 219.0 | 5.643100 | 341300.0 | near_bay | 5.817352 | 1.893579 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.000 | 280.0 | 565.0 | 259.0 | 3.846200 | 342200.0 | near_bay | 6.281853 | 1.578195 |
houses_cleaned.tail()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | log_median_income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20630 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | inland | 5.045455 | 0.940124 |
| 20631 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | inland | 6.114035 | 1.268861 |
| 20632 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | inland | 5.205543 | 0.993252 |
| 20633 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | inland | 5.329513 | 1.053336 |
| 20634 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | inland | 5.254717 | 1.220417 |
houses_cleaned.describe()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | log_median_income |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 | 20635.000000 |
| mean | -119.569999 | 35.632412 | 28.636152 | 2441.902575 | 502.689858 | 1337.121105 | 469.066731 | 3.801266 | 205938.952435 | 5.304655 | 1.510583 |
| std | 2.003685 | 2.135918 | 12.583924 | 1397.859491 | 287.261212 | 765.561218 | 265.518130 | 1.657765 | 113192.930024 | 1.246199 | 0.342970 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | -112.080009 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 | 2.023219 | 0.405398 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1448.000000 | 296.000000 | 787.000000 | 280.000000 | 2.563100 | 119600.000000 | 4.440684 | 1.270631 |
| 50% | -118.500000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.535200 | 179700.000000 | 5.229091 | 1.511869 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743700 | 264700.000000 | 6.052257 | 1.748104 |
| max | -114.310000 | 41.950000 | 52.000000 | 5698.375000 | 1173.500000 | 3132.000000 | 1092.500000 | 8.013025 | 482412.500000 | 8.469878 | 2.198671 |
This initial view helps in understanding the types of columns (features) the dataset contains, such as continuous numerical values, categorical data, etc. The only remaining preprocessing step is to encode the categorical features into a numeric format.
In our dataset, the ocean_proximity feature is categorical and needs to be encoded into a numeric format. Since there is no ordinal relationship between different types of locations like
'near_bay', 'inland', etc., we can use One-Hot Encoding which creates a separate binary column for each category. In this case, we'll have separate columns for 'near_bay', 'inland', 'near_ocean', and '<1h_ocean', with each row marked as 1 (true) if it belongs to that category and 0 (false) otherwise.
1. Apply One-Hot Encoding to the 'ocean_proximity' feature to transform it into a numeric format suitable for machine learning training. Use the function get_dummies() that is part of pandas. Save the new dataframe in the houses_cleaned variable. Check the new columns to understand how the one-hot encoding works.
# Applying one-hot encoding to the 'ocean_proximity' column
houses_cleaned = pd.get_dummies(houses_cleaned, columns=['ocean_proximity'])
houses_cleaned.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | log_median_income | ocean_proximity_<1h_ocean | ocean_proximity_inland | ocean_proximity_near_bay | ocean_proximity_near_ocean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.000 | 129.0 | 322.0 | 126.0 | 8.013025 | 452600.0 | 6.984127 | 2.198671 | False | False | True | False |
| 1 | -122.22 | 37.86 | 21.0 | 5698.375 | 1106.0 | 2401.0 | 1092.5 | 8.013025 | 358500.0 | 6.238137 | 2.198671 | False | False | True | False |
| 2 | -122.24 | 37.85 | 52.0 | 1467.000 | 190.0 | 496.0 | 177.0 | 7.257400 | 352100.0 | 8.288136 | 2.111110 | False | False | True | False |
| 3 | -122.25 | 37.85 | 52.0 | 1274.000 | 235.0 | 558.0 | 219.0 | 5.643100 | 341300.0 | 5.817352 | 1.893579 | False | False | True | False |
| 4 | -122.25 | 37.85 | 52.0 | 1627.000 | 280.0 | 565.0 | 259.0 | 3.846200 | 342200.0 | 6.281853 | 1.578195 | False | False | True | False |
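Note that recent pandas versions return boolean (True/False) dummy columns from get_dummies(). If you prefer explicit 0/1 values, the function also accepts a dtype argument; here is a minimal sketch on a toy column (the rest of the practical works with the boolean columns just as well):
# Illustrative sketch only: one-hot encoding a toy column with integer 0/1 output
toy = pd.DataFrame({'ocean_proximity': ['near_bay', 'inland', 'near_bay']})
print(pd.get_dummies(toy, columns=['ocean_proximity'], dtype=int))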
print(houses_cleaned.describe())
longitude latitude housing_median_age total_rooms \
count 20635.000000 20635.000000 20635.000000 20635.000000
mean -119.569999 35.632412 28.636152 2441.902575
std 2.003685 2.135918 12.583924 1397.859491
min -124.350000 32.540000 1.000000 2.000000
25% -121.800000 33.930000 18.000000 1448.000000
50% -118.500000 34.260000 29.000000 2127.000000
75% -118.010000 37.710000 37.000000 3148.000000
max -114.310000 41.950000 52.000000 5698.375000
total_bedrooms population households median_income \
count 20635.000000 20635.000000 20635.000000 20635.000000
mean 502.689858 1337.121105 469.066731 3.801266
std 287.261212 765.561218 265.518130 1.657765
min -112.080009 3.000000 1.000000 0.499900
25% 296.000000 787.000000 280.000000 2.563100
50% 435.000000 1166.000000 409.000000 3.535200
75% 647.000000 1725.000000 605.000000 4.743700
max 1173.500000 3132.000000 1092.500000 8.013025
median_house_value rooms_per_household log_median_income
count 20635.000000 20635.000000 20635.000000
mean 205938.952435 5.304655 1.510583
std 113192.930024 1.246199 0.342970
min 14999.000000 2.023219 0.405398
25% 119600.000000 4.440684 1.270631
50% 179700.000000 5.229091 1.511869
75% 264700.000000 6.052257 1.748104
max 482412.500000 8.469878 2.198671
One of the key steps before splitting our data into train and test sets is to remove the target feature from the predictors and save it in a separate variable. In this way, we are sure that we will not use the target feature as one of the predictors.
2. Separate the 'houses_cleaned' dataset into the feature set (X) and the target variable (y). The target variable is median_house_value.
We split the dataset into two parts: independent/input variables and 'median_house_value' as the dependent target variable.
# Assuming 'median_house_value' is the target variable
target = 'median_house_value'
# separating the data into features and target
X = houses_cleaned.drop(target, axis=1) # axis = 1 refers to columns
y = houses_cleaned[target]
One of the most important steps when working on a machine learning problem is to split the data into training and test sets. Sklearn has a function that can do this for us.
3. First transform the target variable into its log values, since this reduces the skew and helps the linear regression work better. Use np.log() for this transformation. Then use the function train_test_split from sklearn.model_selection to divide 'houses_cleaned' into training (80%) and testing (20%) sets (parameter test_size). Use a fixed random_state=42 for reproducibility. Confirm the split by examining the initial rows of the training set.
Tip: The function train_test_split returns 4 objects, so you can start with X_train, X_test, y_train, y_test = train_test_split()
# Log-transform the target variable to reduce its skew
y_log_transformed = np.log(houses_cleaned['median_house_value'])
# Using 'train_test_split' to divide the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_log_transformed, test_size=0.2, random_state=42)
# Confirming the split by examining the initial rows of the training set
print(X_train.head())
print(y_train.head())
longitude latitude housing_median_age total_rooms total_bedrooms \
19980 -119.31 36.20 23.0 1837.0 332.0
11231 -117.96 33.81 34.0 1416.0 277.0
2271 -119.80 36.78 43.0 2382.0 431.0
8658 -118.38 33.83 40.0 3070.0 570.0
6248 -117.97 34.05 34.0 2050.0 495.0
population households median_income rooms_per_household \
19980 1064.0 335.0 3.1453 5.483582
11231 980.0 284.0 4.7772 4.985915
2271 874.0 380.0 3.5542 6.268421
8658 1264.0 506.0 5.1626 6.067194
6248 1832.0 465.0 2.8333 4.408602
log_median_income ocean_proximity_<1h_ocean ocean_proximity_inland \
19980 1.421975 False True
11231 1.753919 True False
2271 1.516050 False True
8658 1.818499 True False
6248 1.343726 True False
ocean_proximity_near_bay ocean_proximity_near_ocean
19980 False False
11231 False False
2271 False False
8658 False False
6248 False False
19980 11.218554
11231 12.114505
2271 11.477298
8658 12.977800
6248 11.955686
Name: median_house_value, dtype: float64
4. Print and verify the dimensions and total number of elements in each of the training and testing sets (X_train, X_test, y_train, y_test) after splitting. Use the attributes shape and size for this.
# Verify the dimensions
print("X_train {} and size {}".format(X_train.shape, X_train.size))
print("X_test {} and size {}".format(X_test.shape, X_test.size))
print("y_train {} and size {}".format(y_train.shape, y_train.size))
print("y_test {} and size {}".format(y_test.shape, y_test.size))
X_train (16508, 14) and size 231112
X_test (4127, 14) and size 57778
y_train (16508,) and size 16508
y_test (4127,) and size 4127
We see that we have 14 predictors. There are 16,508 rows in the training set and 4,127 in the test set.
Now let's apply standardization to our features. This is particularly important for regression models and regularization techniques.
5. Scale the input features using the StandardScaler().
Note: for your training data you can use the function fit_transform() which learns the scaling parameters from training data, and immediately applies them to the training data. However, for the test data you have to use transform() to apply the scaling rules learned from the training data to the test data.
Why not fit on test data? Because that would leak information from the test set into the model, which breaks the rule of fair evaluation. The test set should simulate new, unseen data — we shouldn’t "observe" it while preprocessing
# Define the standard scaler
scaler = StandardScaler()
# Fit the scaler on the training data and transform both the training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
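As a quick sanity check (optional, not part of the exercise), we can confirm that the scaled training features have mean approximately 0 and standard deviation approximately 1, while the test features are only approximately standardized, because the scaling parameters were learned from the training data alone.
# Optional sanity check of the scaling (sketch)
print("Train means (first 3 features):", X_train_scaled.mean(axis=0)[:3].round(3))
print("Train stds (first 3 features):", X_train_scaled.std(axis=0)[:3].round(3))
print("Test means (first 3 features):", X_test_scaled.mean(axis=0)[:3].round(3))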
Linear regression is one of the basic algorithms for predicting a quantitative response. To refresh your memory, linear regression assumes a linear relationship between the predictors (X) and the target variable (y). One of the advantages of linear regression is that the results are interpretable, meaning that it is easy to understand the model and its predictions.
6. Create a LinearRegression object and store it in the variable Linear_Regression. Once the model is created you can fit the model on the training data using the function fit(). The documentation is here: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
Note: This pattern will come up in most of the models that we will use in the next practicals. First we create the model by calling the constructor (so we create an object in Python, for example Linear_Regression = LinearRegression()) and then we fit the model on the training set (Linear_Regression.fit(X_train, y_train)). Once the model is fit (trained), we can use it to make predictions on the test set.
In the current question, you need to use the scaled version of X_train which we created in the previous exercise.
# Create a linear regression model
Linear_Regression = LinearRegression()
# Train the model on the scaled training data
Linear_Regression.fit(X_train_scaled, y_train)
LinearRegression()
If we now want to print the intercept and the coefficients, we can run the following:
print("Intercept:", Linear_Regression.intercept_)
coef_df = pd.DataFrame({
'Feature': X_train.columns,
'Coefficient': Linear_Regression.coef_
})
print(coef_df)
Intercept: 12.083924657893421
Feature Coefficient
0 longitude -0.306512
1 latitude -0.318071
2 housing_median_age 0.047750
3 total_rooms -0.128552
4 total_bedrooms 0.282053
5 population -0.211067
6 households 0.089760
7 median_income 0.281451
8 rooms_per_household 0.000819
9 log_median_income 0.086619
10 ocean_proximity_<1h_ocean 0.049764
11 ocean_proximity_inland -0.078587
12 ocean_proximity_near_bay 0.013416
13 ocean_proximity_near_ocean 0.022889
Can you explain the intercept and the coefficients? Which predictors influence the median price in a negative way and which in a positive way?
The intercept (12.08) is the predicted log(median_house_value) when all features are at their mean values (because the data is standardized).
Each coefficient represents the effect of a 1 standard deviation increase in that feature, holding all others constant.
For example, median_income has a positive coefficient (0.28), so a one standard deviation increase in median income is associated with roughly a 0.28 increase in the log median house value, holding the other features constant. Predictors with negative coefficients, such as longitude, latitude, total_rooms, population and ocean_proximity_inland, push the predicted price down, while those with positive coefficients, such as median_income, total_bedrooms and housing_median_age, push it up.
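To see how the intercept and the coefficients are actually used, here is a minimal sketch (not part of the exercise) that reconstructs a single prediction by hand from the fitted model and the scaled test data, and compares it with predict():
# Sketch: a prediction is the intercept plus the dot product of the coefficients with the scaled features
manual_pred = Linear_Regression.intercept_ + np.dot(X_test_scaled[0], Linear_Regression.coef_)
print("Manual prediction:", manual_pred)
print("predict() output: ", Linear_Regression.predict(X_test_scaled[:1])[0])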
Now that we have the model, we can use it to make predictions on new data (the test set).
7. Make predictions on the test data and evaluate the model's performance using R-squared and MSE. For the predictions, you can use the function predict() and save the result into the y_pred variable. This function takes the X test data as a parameter, since we want to use the input features from the test data to make the predictions. The functions r2_score() and mean_squared_error() return the R-squared and MSE values respectively.
y_pred = Linear_Regression.predict(X_test_scaled)
print("R-squared for Linear_Regression:", r2_score(y_test, y_pred))
print("MSE for Linear_Regression:", mean_squared_error(y_test, y_pred))
R-squared for Linear_Regression: 0.7035479323877921
MSE for Linear_Regression: 0.09551544903125564
R-squared indicates the proportion of the total variation in the outcome that is explained by the model. The R-squared is about 0.70, which suggests a reasonably good fit. We also see that the MSE is about 0.095.
Since we are predicting log house prices, this is the average squared error in log-units. To understand it in real-world terms, take the square root of the MSE and then convert it back from the log scale: RMSE (log scale) = √0.0955 ≈ 0.309
Back-transformed error factor = e^0.309 ≈ 1.36
This means the predicted price is typically off by a factor of about 1.36, or around 36%.
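A minimal sketch of this back-transformation in code (assuming y_test and y_pred are on the log scale, as produced earlier in this practical):
# Sketch: back-transform the error from log-units to a multiplicative error factor
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred))   # RMSE in log-units
error_factor = np.exp(rmse_log)                          # typical multiplicative error
print(f"RMSE (log scale): {rmse_log:.3f}")
print(f"Typical error factor: {error_factor:.2f} (about {100 * (error_factor - 1):.0f}%)")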
Now we are also going to apply Lasso and Ridge regularization.
Lasso regression is one of the regularization techniques; it can shrink some of the feature coefficients to exactly zero.
We will use cross-validation to find the best lambda parameter.
8. Implement LassoCV() with a range of alphas for cross-validation to identify relevant features and penalize irrelevant ones. In this context alpha is the same parameter as lambda. Start your code by defining a range of alphas such as alphas=np.logspace(-4, 4, 100). Also, set the number of folds to 5 (cv = 5)
# Prepare LassoCV with a range of alphas for cross-validation
alphas=np.logspace(-4, 4, 100)
lasso_model = LassoCV(alphas=alphas, cv=5)
9. Fit the lasso model on the training set and print the best alpha value. Use the attribute lasso_model.alpha_ to get the best alpha.
lasso_model.fit(X_train_scaled, y_train)
# Extract the best alpha parameter
best_alpha = lasso_model.alpha_
print("Best alpha:", best_alpha)
Best alpha: 0.0001
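Since Lasso can shrink coefficients to exactly zero, we can check which features (if any) were dropped at this alpha. A minimal sketch, not part of the exercise:
# Sketch: inspect the Lasso coefficients; features with a coefficient of exactly 0 were dropped
lasso_coefs = pd.Series(lasso_model.coef_, index=X_train.columns)
print("Features shrunk to zero:", list(lasso_coefs[lasso_coefs == 0].index))
print(lasso_coefs.sort_values())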
10. Plot the alpha values against the cross-validated mean squared error and mark the optimal alpha. We give you the structure of the code, you only need to fill in two lines. What does this plot tell us?
# Plot alpha values against mean squared error
plt.figure(figsize=(10, 6))
plt.plot(lasso_model.alphas_, lasso_model.mse_path_.mean(axis = 1), marker='o')
plt.axvline(lasso_model.alpha_, linestyle = "--")
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error vs. Alpha for Lasso')
plt.legend(["MSE per alpha", "Optimal alpha"])
plt.grid(True)
plt.show()
This plot shows how the mean squared error (MSE) changes depending on the value of the regularization parameter α (alpha) used in Lasso regression:
X-axis: the regularization strength alpha (on a logarithmic scale).
Y-axis: Cross-validated mean squared error (MSE). Lower MSE = better predictive performance.
The solid blue line shows the average MSE for each alpha.
The MSE is lowest when alpha is small — this means we want a model that keeps most features with only slight regularization. As alpha increases, the model becomes too simple and starts to underfit (higher MSE).
11. Use the Lasso model that is now trained to make the predictions on the test set. Print the R-squared and the MSE values.
# Predict on the test set
lasso_y_pred = lasso_model.predict(X_test_scaled)
# Calculate the Mean Squared Error and R-squared for evaluation
lasso_mse = mean_squared_error(y_test, lasso_y_pred)
lasso_r2 = r2_score(y_test, lasso_y_pred)
print("Lasso Regression MSE:", lasso_mse)
print(f"R-squared for Lasso: {lasso_r2:.10f}")
Lasso Regression MSE: 0.09551612828131519
R-squared for Lasso: 0.7035458242
The second regularization method that we learned is the Ridge regression. Let's also apply ridge regression on our data.
12. Construct a RidgeCV object with a range of alphas. Similar to Lasso, use 5 folds and use the same alphas. Then print the optimal alpha value.
# Prepare RidgeCV with a range of alphas for cross-validation
ridge_model = RidgeCV(alphas = alphas, cv=5)
# Fit the RidgeCV model to find the best alpha
ridge_model.fit(X_train_scaled, y_train)
# Extract the best alpha value
optimal_alpha_ridge = ridge_model.alpha_
print("Optimal Alpha for Ridge:", optimal_alpha_ridge)
Optimal Alpha for Ridge: 3.351602650938848
13. Use the predict() function to generate the predictions on the test set.
# Predict on the test set
ridge_y_pred = ridge_model.predict(X_test_scaled)
14. Calculate the R-squared and the MSE for the ridge regression model
# Calculate the Mean Squared Error and R-squared for evaluation
ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_r2 = r2_score(y_test, ridge_y_pred)
print("Ridge Regression MSE:", ridge_mse)
print(f"R-squared for Ridge: {ridge_r2:.10f}")
Ridge Regression MSE: 0.09551536817719684
R-squared for Ridge: 0.7035481833
Observation:
The R-squared values are almost identical for all three models (around 0.703). This suggests that all three models explain a similar amount of variance in the dependent variable.
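Even though the R-squared values are almost identical, we can compare the coefficients of the three models side by side to see how regularization affects them. A minimal sketch, not part of the exercise:
# Sketch: compare the coefficients of the plain, Lasso and Ridge models
coef_comparison = pd.DataFrame({
    'Feature': X_train.columns,
    'Linear': Linear_Regression.coef_,
    'Lasso': lasso_model.coef_,
    'Ridge': ridge_model.coef_
})
print(coef_comparison)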
A very useful tool is to visualise the model fit and see how the predictions look compared to the actual data.
15. Create comparative visualizations to assess the model fit of Lasso and Ridge regression models.
You can use the regplot() function from seaborn. To make the plot nicer, add the argument scatter_kws={'alpha': 0.5, 'color': 'lightblue'}, and to style the regression line you can add the argument line_kws={'color': 'black', 'linestyle': '--', 'linewidth': 2.5}. Play with the parameters to see how your plot changes.
plt.figure(figsize=(10, 4))
# Lasso Regression Model Fit Plot
plt.subplot(1, 2, 1)
sns.regplot(x=y_test, y=lasso_y_pred, scatter_kws={'alpha':0.5, 'color': 'lightblue'}, line_kws={'color': 'black', 'linestyle': '--', 'linewidth': 2.5})
plt.xlabel('Actual Values', fontsize=8)
plt.ylabel('Predicted by Lasso', fontsize=8)
plt.title('Model Fit: Lasso Regression', fontsize=10)
# Ridge Regression Model Fit Plot
plt.subplot(1, 2, 2)
sns.regplot(x=y_test, y=ridge_y_pred, scatter_kws={'alpha':0.5, 'color': 'lightblue'}, line_kws={'color': 'black', 'linestyle': '--', 'linewidth': 2.5})
plt.xlabel('Actual Values', fontsize=8)
plt.ylabel('Predicted by Ridge', fontsize=8)
plt.title('Model Fit: Ridge Regression', fontsize=10)
plt.tight_layout()
plt.show()
The plots for the Lasso and Ridge regression models indicate that neither model perfectly predicts higher values, with predictions becoming less accurate as actual values increase. At first glance, both models perform similarly, without a clear advantage of one over the other.
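One optional way to look further into this (not part of the exercise) is a residual check. The probplot function we imported earlier produces a Q-Q plot; if the residuals fall roughly on a straight line, they are approximately normally distributed. A minimal sketch for the ridge predictions:
# Sketch: Q-Q plot of the Ridge residuals (actual minus predicted, on the log scale)
residuals = y_test - ridge_y_pred
plt.figure(figsize=(5, 4))
probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q plot of Ridge residuals')
plt.show()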
End of practical!