Anastasia Giachanou, Tina Shahedi
Machine Learning with Python - Utrecht Summer School
Welcome to the first practical of the course!
In this practical, we are going to get familiar with Python and Google Colab, and then we will do some data exploration and visualization! You can also look at the Python documentation to refresh your knowledge of programming: https://docs.python.org/3/reference/
Google Colaboratory, or "Colab" for short, allows you to write and execute Python in your browser, with zero configuration required.
Colab notebooks are Jupyter notebooks that are hosted by Colab. You can find more detailed introductions to Colab here, but we will also cover the basics.
After completing this lab you will be able to:
- work with Python in a Google Colab notebook,
- load a dataset with pandas and explore it with descriptive statistics,
- create basic visualizations with matplotlib and seaborn,
- apply simple feature scaling (Min-Max scaling and standardization).
Here we are going to introduce Python and Google Colab. If you feel that you want to refresh your Python skills more, you can also complete the preparations from here: http://giachanou.com/machine-learning/#prepare.
1. Open Colab and create a new empty notebook to work with Python 3!
Go to https://colab.research.google.com/ and login with your account. Then click on "File → New notebook".
Some useful keyboard shortcuts when working with cells:
- To run the current cell and insert a new code cell below it, press Alt + Enter.
- To stop code that is running, press Ctrl + M I or simply click the stop button.
- To move the code of a cell, press Ctrl + A to select all the code of that cell and Ctrl + X to cut it. The cell is now empty and can be deleted with Ctrl + M D or by pressing the delete button. You can then paste the code into a new code cell and adjust it.
NB: On MacBooks, use cmd instead of ctrl in these shortcuts.
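Once the notebook is open, type a line of Python in the first code cell and run it with Shift + Enter (or the play button next to the cell) to check that everything works; for example:
# A first code cell
print('Hello, Colab!')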
2. Install the necessary libraries.
Use the !pip install
command and install the packages: numpy
, pandas
, scikit-learn
, matplotlib
, and seaborn
. For example, to install numpy
type !pip install -q numpy
Generally, you only need to install a package once on your computer and can then simply import it whenever you need it. In Colab, however, you may need to reinstall a package after your runtime is reset or you reconnect. That said, many of the packages we will use come pre-installed on Google Colab.
!pip install -q numpy
!pip install -q pandas
!pip install -q scikit-learn
!pip install -q matplotlib
!pip install -q seaborn
3. Import the necessary packages.
The packages are now installed, but to be able to use their functions we have to import them. A common practice is to import the packages all together at the beginning of the code. However, syntactically you can also do it later in the code (but always before you use it). For example, to import numpy
type import numpy as np
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
Every problem in machine learning starts with the data. Usually, we first want to get some information about our data and understand them.
It is possible that you have your own data that you want to explore. However, there are also many publicly available datasets online, and we can scrape data from the web or use social media APIs.
For this first practical, we are going to use the California housing dataset. This dataset contains house attributes and summary statistics from the 1990 California census.
To begin working with the dataset in Google Colab, you need to first upload the 'housing.csv'
file. This can be accomplished by clicking on the 'Files'
button located on the left side of the Colab interface (the icon that looks like a folder). You have the option to either drag and drop the file or use the upload button for this purpose. As an alternative, Google Drive can be mounted within Colab, allowing you to access and upload the dataset directly from there.
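As a small sketch (this only works inside Colab), you can also trigger the upload dialog from code with the files helper of the google.colab package:
# Optional: upload housing.csv from code instead of the Files pane (Colab only)
from google.colab import files
uploaded = files.upload()  # opens a file picker; the file is saved in the working directory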
4. Read the housing.csv
dataset using the read_csv()
function. Store the dataframe
to a variable called houses
. Check the first lines of the dataframe using head()
and the last ones with the tail()
function
#code to mount google drive
#from google.colab import drive
#drive.mount('/content/drive')
houses = pd.read_csv("housing.csv")
#print the first lines of the dataset
houses.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
#print the last lines of the dataset
houses.tail()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND |
20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND |
20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND |
20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND |
20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND |
Initially, we observe that our dataset contains 20,640 rows (indexed 0 to 20639) and 10 columns, and head() and tail() show us a small sample of the data.
To examine the dataset in more detail, we can use the info() and/or describe() functions. These functions give more information about the data types and some descriptive statistics.
5. Display basic information about the houses dataframe using the info()
function. What can you say from this information? What type of data do the columns contain?
print(houses.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
We see that we have 10 columns and most of them consist of float
data types, except for ocean_proximity
, which is an object of the string type in pandas
. Also, we see that the only column with null values is the total_bedrooms
.
The 10 columns are the following:
- longitude: how far west the block is (geographic coordinate)
- latitude: how far north the block is (geographic coordinate)
- housing_median_age: median age of the houses in the block
- total_rooms: total number of rooms in the block
- total_bedrooms: total number of bedrooms in the block
- population: number of people living in the block
- households: number of households in the block
- median_income: median income of the households in the block (in tens of thousands of dollars)
- median_house_value: median house value in the block (in dollars)
- ocean_proximity: location of the block with respect to the ocean (categorical)
6. Now use the describe()
function to print the descriptive statistics of the numerical features in the dataset. What can you say regarding the range of the variables?
print(houses.describe())
          longitude      latitude  housing_median_age   total_rooms
count  20640.000000  20640.000000        20640.000000  20640.000000
mean    -119.569704     35.631861           28.639486   2635.763081
std        2.003532      2.135952           12.585558   2181.615252
min     -124.350000     32.540000            1.000000      2.000000
25%     -121.800000     33.930000           18.000000   1447.750000
50%     -118.490000     34.260000           29.000000   2127.000000
75%     -118.010000     37.710000           37.000000   3148.000000
max     -114.310000     41.950000           52.000000  39320.000000

       total_bedrooms    population    households  median_income
count    20433.000000  20640.000000  20640.000000   20640.000000
mean       537.870553   1425.476744    499.539680       3.870671
std        421.385070   1132.462122    382.329753       1.899822
min          1.000000      3.000000      1.000000       0.499900
25%        296.000000    787.000000    280.000000       2.563400
50%        435.000000   1166.000000    409.000000       3.534800
75%        647.000000   1725.000000    605.000000       4.743250
max       6445.000000  35682.000000   6082.000000      15.000100

       median_house_value
count        20640.000000
mean        206855.816909
std         115395.615874
min          14999.000000
25%         119600.000000
50%         179700.000000
75%         264725.000000
max         500001.000000
We can see the descriptive statistics of the numerical columns. From these statistics we see that, on average, a block has about 2,636 rooms and 538 bedrooms in total. Also, the maximum median house value is 500,001 and the minimum is 14,999. With this summary you can also check whether any column contains unexpected values (for example, a negative minimum in a price column could indicate an error).
Another important thing that we have to check is whether there are missing values in the dataframe. We already have an idea from the info() function, but we can also print the exact number of null values per column.
7. Check if there are missing values in the houses
dataframe. You can use the houses.isnull().sum()
# Check for missing values
print(houses.isnull().sum())
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
Firstly, we identify any missing values across the columns by applying the .isnull().sum()
function to each column in the dataset, which helps us determine the total number of null entries. As seen above, all columns are complete except for 'total_bedrooms'
, which has 207 missing values.
We have three choices for dealing with the missing data in the 'total_bedrooms' column: remove the rows that contain missing values, remove the whole column, or impute the missing values (for example, fill them with the column median). For this example, we will leave the data as they are and keep exploring; depending on your problem, you can decide to remove or impute some values. Imputation techniques are not covered in this course, but a small sketch of the three options is shown below for reference.
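A minimal sketch of what those three options could look like in pandas (none of these are applied to the houses dataframe in this practical; the variable names are just for illustration):
# Option A: drop the rows with a missing total_bedrooms value
houses_drop_rows = houses.dropna(subset=['total_bedrooms'])
# Option B: drop the whole column
houses_drop_col = houses.drop(columns=['total_bedrooms'])
# Option C: impute the missing values with the column median
houses_imputed = houses.copy()
houses_imputed['total_bedrooms'] = houses_imputed['total_bedrooms'].fillna(
    houses_imputed['total_bedrooms'].median())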
Another thing we can do is to check the number of unique values present in each column of the houses
dataset.
8. Determine the number of unique values present in each column of the houses
dataset. You can use the function nunique()
for this
houses.nunique()
longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64
We know that ocean_proximity is of string type, and from the previous question we now see that it has 5 unique values. Let's see how many times each value appears.
9. Calculate the frequency of each unique value in the 'ocean_proximity
' column of the houses
dataset. First you can take the values of the column and then use the value_counts()
function
count_per_unique_value = houses['ocean_proximity'].value_counts()
print(count_per_unique_value)
ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64
Here, we see that the value ISLAND has very few observations compared to the rest of the values: only 5 rows. Removing these rows will not have any big impact, so depending on the context of the problem we can decide to drop them.
10. Remove the rows for which the ocean proximity has the value of ISLAND
. The function that you need to remove data from a dataframe is drop()
houses.drop(houses[houses['ocean_proximity'] == "ISLAND"].index, inplace=True)
# Display the DataFrame to confirm that the rows have been removed
houses
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND |
20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND |
20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND |
20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND |
20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND |
20635 rows × 10 columns
11. Sometimes we need to do some transformation of the values of the dataframe. Let's try to do this now. Convert the values in the ocean_proximity column to lower case (str.lower()) and replace the spaces with underscores (str.replace()).
# Converting 'ocean_proximity' to lower case and replacing spaces with underscores
houses['ocean_proximity'] = houses['ocean_proximity'].str.lower().str.replace(" ", "_")
# Displaying the 'ocean_proximity' column to confirm the changes
houses['ocean_proximity']
0        near_bay
1        near_bay
2        near_bay
3        near_bay
4        near_bay
           ...
20635      inland
20636      inland
20637      inland
20638      inland
20639      inland
Name: ocean_proximity, Length: 20635, dtype: object
Often we also want to add new columns based on the values of other ones. Here, we are going to add one more column that indicates whether the median house value of a block is above or below the median of the whole dataset.
12. Add a new column in the dataframe houses
called high_value
which is 1 if the median house value is above the median of the dataset, otherwise 0.
houses['high_value'] = (houses['median_house_value'] > houses['median_house_value'].median()).astype(int)
print(houses.head())
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0
1    -122.22     37.86                21.0       7099.0          1106.0
2    -122.24     37.85                52.0       1467.0           190.0
3    -122.25     37.85                52.0       1274.0           235.0
4    -122.25     37.85                52.0       1627.0           280.0

   population  households  median_income  median_house_value ocean_proximity  \
0       322.0       126.0         8.3252            452600.0        near_bay
1      2401.0      1138.0         8.3014            358500.0        near_bay
2       496.0       177.0         7.2574            352100.0        near_bay
3       558.0       219.0         5.6431            341300.0        near_bay
4       565.0       259.0         3.8462            342200.0        near_bay

   high_value
0           1
1           1
2           1
3           1
4           1
Another important tool when we explore our data is to group the data and get some statistics per group. Pandas offers this functionality through the groupby() function.
13. Group by ocean_proximity and calculate the mean of total_rooms for each category. What can you say about the mean total number of rooms per ocean proximity category?
# Group by 'ocean_proximity' and calculate the mean total_rooms for each category
average_rooms_by_proximity = houses.groupby('ocean_proximity')['total_rooms'].mean()
print(average_rooms_by_proximity)
ocean_proximity
<1h_ocean     2628.343586
inland        2717.742787
near_bay      2493.589520
near_ocean    2583.700903
Name: total_rooms, dtype: float64
From those mean values we can see that the blocks that are INLAND have the highest mean total number of rooms.
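If you want several statistics per group at once, groupby() can be combined with agg(); a small sketch (the column choice here is just an example):
# Mean, median and count of total_rooms per ocean_proximity category
rooms_stats = houses.groupby('ocean_proximity')['total_rooms'].agg(['mean', 'median', 'count'])
print(rooms_stats)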
We already started some exploratory analysis with the descriptive statistics. However, to get a better understanding of the data we can further generate some plots. We can create among others histograms, boxplots and correlation matrices.
To create those plots, we will use Matplotlib
, a widely-used Python library for data visualization. If you want to know more about this package, see the documentation here: https://matplotlib.org/ .
Let's first see how to use Matplotlib and which are its main components.
An Axes in Matplotlib is a single plot within a figure, with essential elements like data limits (controlled by set_xlim()
and set_ylim()
methods), a title (set_title()
), x-label (set_xlabel()
), and y-label (set_ylabel()
). It's where the data, along with associated labels and ticks, is plotted.
Subplots allow for multiple plots (axes
) to be arranged in a grid within a single figure, facilitating comparative analysis of different data aspects. You can create subplots using plt.subplots()
, which will return a figure and an array of axes, accessible through indexing or row-column notation.
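As a minimal sketch of these pieces (a figure, a grid of axes, titles, labels and limits), here is one way it could look; the column choices are just examples:
# One figure with a 1x2 grid of axes
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
# Left axes: plot directly on the Axes object and set its elements
axes[0].hist(houses['housing_median_age'], bins=20)
axes[0].set_title('Housing median age')
axes[0].set_xlabel('Median age')
axes[0].set_ylabel('Frequency')
axes[0].set_xlim(0, 60)
# Right axes: a second plot in the same figure
axes[1].hist(houses['median_income'], bins=20)
axes[1].set_title('Median income')
axes[1].set_xlabel('Median income')
axes[1].set_ylabel('Frequency')
fig.tight_layout()
plt.show()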
Let's now create some histograms using Matplotlib.
For example, we can use the following code to create a histogram of the housing_median_age column. Note that we are using the histplot function from the Seaborn package to create the histogram in this case.
# Define a figure of size (6,4)
plt.figure(figsize=(6, 4))
# create the histogram using the histplot
sns.histplot(houses['housing_median_age'], kde=True)
# add different elements such as title and x, y labels
plt.title('Distribution of Median Age')
plt.xlabel('Median Age')
plt.ylabel('Frequency')
# Set the limits on the X and Y axes
x_lim = (0, 60)
y_lim = (0, 5000)
plt.xlim(x_lim)
plt.ylim(y_lim)
plt.show()
This histogram visualizes the frequency distribution of the 'housing_median_age
' feature.
14. Create histograms for other variables in the houses
dataframe. You can use the plt.subplots
to place them in a grid. For example, if you want to create 2 subplots in a 2X2 grid you can run fig, axes = plt.subplots(2, 2, figsize=(12, 6))
. What can you say about the distributions of the features?
n_rows = 6
n_cols = 2
# Creating the figure and subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 6, n_rows * 3))
# Flattening the axes array for easier iteration
axes = axes.flatten()
# Creating histograms for each column in the DataFrame
for ax, column in zip(axes, houses.columns):
sns.histplot(houses[column], bins=20, ax=ax)
ax.set_title(f'Histogram of {column}')
ax.set_xlabel(column)
ax.set_ylabel('Frequency')
# Hide any extra subplots
for i in range(len(houses.columns), n_rows * n_cols):
fig.delaxes(axes[i])
# Adjusting the layout and displaying the figure
fig.tight_layout()
plt.show()
From these histograms, we can see that several features (for example 'total_rooms', 'population' and 'median_house_value') are concentrated at the left end of their range, i.e., their distributions are right-skewed.
Next, we can also plot some boxplots which are going to show whether there are any potential outliers or not.
15. Create box plots (use the boxplot
function from seaborn
) for the columns 'total_rooms'
, 'total_bedrooms'
, 'population'
. Determine if there are any outliers.
# List of columns to check for outliers
columns_to_check = ['total_rooms', 'total_bedrooms', 'population']
# Create a box plot for each specified column
for column in columns_to_check:
plt.figure(figsize=(8, 4))
sns.boxplot(x=houses[column])
plt.title(f'Box plot of {column}')
plt.show()
These boxplots show that the variables are far from normally distributed: most values sit in a narrow range, and there are many points above the upper whisker that could be flagged as outliers and explored further. For this practical, we will leave them as they are.
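If you want to quantify this, seaborn's boxplot draws its whiskers at 1.5 times the interquartile range (IQR) by default; a small sketch of counting the points beyond that range for total_rooms (not applied further in this practical):
# Count how many total_rooms values fall outside the 1.5*IQR whiskers
q1 = houses['total_rooms'].quantile(0.25)
q3 = houses['total_rooms'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
n_outliers = ((houses['total_rooms'] < lower) | (houses['total_rooms'] > upper)).sum()
print(f'Potential outliers in total_rooms: {n_outliers}')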
16. Create a pair plot for the features 'total_bedrooms'
, 'total_rooms'
and 'households'
. To do this, you can use the function pairplot
from seaborn. This function will create a matrix of scatter plots for each pair of features.
features = ['total_bedrooms', 'total_rooms', 'households']
# Create the pair plot with a specified height for each plot
sns.pairplot(houses[features], height=3)
# Adjust the layout
plt.subplots_adjust(top=1)
# Display the plot
plt.show()
We created a pair plot to examine the relationship among 'total_bedrooms'
, 'total_rooms'
, and 'households'
. This plot reveals a linear relationship between 'total_bedrooms'
and both 'total_rooms'
and 'households'
. This is logical since the number of bedrooms is included in the total room count of a block and is likely influenced by the household size. From this, we infer that the number of bedrooms cannot exceed the total number of rooms and that we can use other dataset features to estimate the number of bedrooms.
In several tasks, we are also interested in seeing whether there are correlations among the features.
17. Investigate the correlation among the features. You can create a heatmap using the heatmap
function from seaborn. To get the correlations, you can use the houses.corr()
function and set the parameter method='spearman'
# Set the size of the figure for the heatmap
plt.figure(figsize=(8, 6))
# Calculate the Spearman correlation matrix and create the heatmap (we specify numeric_only=True)
sns.heatmap(houses.corr(method='spearman', numeric_only=True), annot=True, cmap='magma')
# Add a title to the heatmap
plt.title('Spearman Correlation Among Numeric Features', size=10)
# Display the heatmap
plt.show()
Values range from -1 to +1. Values that are close to +1 indicate a strong positive correlation and values close to -1 a strong negative correlation. From this plot, we see that there is high positive correlation among the features total_rooms
, total_bedrooms
, population
and households
(yellow colors in the center of the plot).
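For a single pair of columns you can also compute the coefficient directly with the corr() method of a Series; a quick sketch (the pair of columns is just an example):
# Spearman correlation between two individual columns
rho = houses['total_rooms'].corr(houses['households'], method='spearman')
print(round(rho, 3))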
18. If your dataframe has many columns (sometimes there can be hundreds of features), you can also choose to view the correlations of one feature with the rest by selecting that feature's column from the correlation matrix. Generate the correlations of median_house_value with the rest of the features. Once you do that, try to change the code so it shows the correlations in descending order (from high to low).
# Set the size of the figure for the heatmap
plt.figure(figsize=(2, 4))
correlation_matrix = houses.corr(method='spearman', numeric_only=True)
sns.heatmap(correlation_matrix[['median_house_value']].sort_values(by='median_house_value', ascending=False), annot=True, cmap='cividis')
plt.title('Correlation of Features with Median House Value',size=10)
plt.show()
The heatmap shows that high_value and median_income are positively correlated with median_house_value, meaning that a higher median income usually goes together with a higher median house value (the very high correlation of high_value is expected, since it was derived directly from median_house_value).
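With hundreds of features you may prefer to skip the heatmap entirely and simply print the sorted correlations as text; a short sketch:
# Correlations of all numeric features with median_house_value, from high to low
corr_with_target = houses.corr(method='spearman', numeric_only=True)['median_house_value']
print(corr_with_target.sort_values(ascending=False))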
19. Utilize the matplotlib
and seaborn
libraries to create a scatter plot (scatterplot()
function) with longitude
on the x-axis and latitude
on the y-axis. Investigate the influence of geographical location on housing prices by visualizing the distribution of median_house_value
across different coordinates.
# Plotting the scatter plot for latitude and longitude
plt.figure(figsize=(8, 4))
sns.scatterplot(
data=houses,
x='longitude',
y='latitude',
size='median_house_value',
hue='median_house_value',
palette='magma',
alpha=0.5)
# Customize the plot
plt.legend(title='Median House Value', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('Median House Value by Geographical Coordinates')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
You can also first group the data and then create plots for the different categories.
20. Group the data by 'ocean_proximity
' and calculate the average 'median_house_value
' for each category. Plot these results using a box plot (boxplot()
function from seaborn
can also be used here).
# Group by 'ocean_proximity' and calculate the average median house value for each category
average_prices_by_proximity = houses.groupby('ocean_proximity')['median_house_value'].mean()
print(average_prices_by_proximity)
# Create a boxplot to visualize the results
sns.boxplot(x='ocean_proximity', y='median_house_value', data=houses, palette="Set3", hue='ocean_proximity', legend=False)
plt.title('Boxplot of Median House Value by Ocean Proximity')
plt.xlabel('Ocean Proximity')
plt.ylabel('Median House Value')
plt.show()
ocean_proximity
<1h_ocean     240084.285464
inland        124805.392001
near_bay      259212.311790
near_ocean    249433.977427
Name: median_house_value, dtype: float64
As we mentioned during the lecture, it is common to transform our features so that they are scaled to a similar range.
Min-Max scaling is a popular approach to normalise numerical data: it compresses all values into the range [0, 1], which makes it useful for algorithms that require bounded input, such as neural networks.
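Concretely, each value x is mapped to (x - min) / (max - min). A toy sketch of that arithmetic on a made-up list (not the housing data):
# Min-Max scaling by hand on a toy list
values = [2.0, 5.0, 9.0]
lo, hi = min(values), max(values)
scaled = [(v - lo) / (hi - lo) for v in values]
print(scaled)  # [0.0, 0.428..., 1.0]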
21. Apply Min-Max scaling to median_income and plot its new distribution. You can start with scaler = MinMaxScaler(), which initialises the Min-Max scaler.
# Apply Min-Max Scaling to 'MedInc'
scaler = MinMaxScaler()
houses['MedInc_MinMax'] = scaler.fit_transform(houses[['median_income']])
# Print original and scaled 'MedInc'
print(houses[['median_income', 'MedInc_MinMax']].head())
# Plot the original and scaled distribution of 'MedInc'
plt.figure(figsize=(8, 4))
# Min-Max Scaled distribution
sns.histplot(houses['MedInc_MinMax'], kde=True, color='green')
plt.title('Min-Max Scaled Distribution of MedInc')
plt.xlabel('MedInc (Min-Max Scaled)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
   median_income  MedInc_MinMax
0         8.3252       0.539668
1         8.3014       0.538027
2         7.2574       0.466028
3         5.6431       0.354699
4         3.8462       0.230776
22. Now apply standardization (StandardScaler()) to median_income and plot its new distribution.
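For reference, standardization maps each value x to z = (x - mean) / std, so the result has mean 0 and standard deviation 1. A toy sketch of that arithmetic on made-up numbers (note that StandardScaler uses the population standard deviation, i.e. ddof=0):
# Standardization by hand on a toy list
values = np.array([2.0, 5.0, 9.0])
z = (values - values.mean()) / values.std()  # numpy's std defaults to ddof=0
print(z)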
# define standard scaler
scaler = StandardScaler()
houses['MedInc_Standard'] = scaler.fit_transform(houses[['median_income']])
# Print original and scaled 'MedInc'
print(houses[['median_income', 'MedInc_Standard']].head())
# Plot the original and scaled distribution of 'MedInc'
plt.figure(figsize=(8, 4))
# Standard Scaled distribution
sns.histplot(houses['MedInc_Standard'], kde=True, color='green')
plt.title('Standardization of MedInc')
plt.xlabel('MedInc (Standardized)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
   median_income  MedInc_Standard
0         8.3252         2.344450
1         8.3014         2.331923
2         7.2574         1.782425
3         5.6431         0.932756
4         3.8462        -0.013024
End of Practical.