Part B.2 Active Learning Strategies¶

Right now we have been using a real-world dataset IMDB. However, to incorporate more of the understanding of Active Learning we are going to use a synthetic dataset, such that visualisation will be a bit easier to do.

In [2]:
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
import skactiveml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from skactiveml.classifier import SklearnClassifier, ParzenWindowClassifier
from skactiveml.pool import UncertaintySampling, ProbabilisticAL, RandomSampling
from skactiveml.pool.multiannotator import SingleAnnotatorWrapper
from skactiveml.stream import StreamRandomSampling, StreamProbabilisticAL
from skactiveml.utils import unlabeled_indices, labeled_indices, MISSING_LABEL, majority_vote, call_func
from skactiveml.visualization import plot_utilities, plot_decision_boundary
from collections import deque
from scipy.ndimage import gaussian_filter1d
from sklearn.manifold import TSNE

import warnings
mlp.rcParams["figure.facecolor"] = "white"
warnings.filterwarnings("ignore")
In [3]:
random_state = np.random.RandomState(0)

# Build a dataset.
X, y_true = make_blobs(
    n_samples=200,
    n_features=2,
    centers=[[0, 1], [-3, 0.5], [-1, -1], [2, 1], [1, -0.5]],
    cluster_std=0.7,
    random_state=random_state,
)
y_true = y_true % 2
y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL)

plt.figure(figsize=(8, 6))
scatter = plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap="coolwarm", edgecolor="k", s=60)
plt.title("make blobs")
plt.xlabel("Feature A")
plt.ylabel("Feature B")
plt.grid(True)
plt.colorbar(scatter, ticks=[0, 1], label="Label")
plt.show()

Now by creating this dataset we can see there is a cluster in the middle and two clusters on the left and right side.

7. Make an Active Learning Loop with this dataset and include in there the following strategies: Uncertainty Sampling, Query By Committee, Expected Error Reduction and Probabilistic Active Learning. Use the ParzenWindowClassifier as your clf

In [ ]:
from skactiveml.pool import MonteCarloEER, QueryByCommittee

# Initialise the classifier.
clf = ParzenWindowClassifier(classes=[0, 1], random_state=random_state)

# Initialise the query strategy.
# qs = MonteCarloEER(method='misclassification_loss')
# qs = ProbabilisticAL()
qs = UncertaintySampling(method='entropy', random_state=42)
# qs = QueryByCommittee(method='KL_divergence', sample_predictions_method_name='sample_proba', sample_predictions_dict={'n_samples': 100})

# if qs == ProbabilisticAL():
#     dens_est = clf.fit(X, np.zeros(len(X)))
#     dens = clf.predict_freq(X)[:, 0]
In [5]:
# Active learning cycle:
n_cycles = 20
for i in range(n_cycles):
    # Fit the classifier with current labels.
    clf.fit(X, y)

    # Query the next sample(s).
    if isinstance(qs, ProbabilisticAL):
        query_idx = qs.query(X=X, y=y, clf=clf, utility_weight=dens)
    elif isinstance(qs,MonteCarloEER):
        query_idx = qs.query(X=X, y=y, clf=clf, ignore_partial_fit=True)
    elif isinstance(qs, UncertaintySampling):
        query_idx = qs.query(X=X, y=y, clf=clf)
    elif isinstance(qs,QueryByCommittee):
        query_idx = qs.query(X=X, y=y, ensemble=clf)

    # Update labels based on query.
    y[query_idx] = y_true[query_idx]

    # Evaluate the classifier on the test set
    y_pred = clf.predict(X)
    acc = accuracy_score(y_true, y_pred)
    print(f'Simple Evaluation Iteration {i + 1}/{n_cycles}, Accuracy: {acc:.4f}')
Simple Evaluation Iteration 1/20, Accuracy: 0.5250
Simple Evaluation Iteration 2/20, Accuracy: 0.6000
Simple Evaluation Iteration 3/20, Accuracy: 0.6000
Simple Evaluation Iteration 4/20, Accuracy: 0.5050
Simple Evaluation Iteration 5/20, Accuracy: 0.6500
Simple Evaluation Iteration 6/20, Accuracy: 0.5300
Simple Evaluation Iteration 7/20, Accuracy: 0.6550
Simple Evaluation Iteration 8/20, Accuracy: 0.7250
Simple Evaluation Iteration 9/20, Accuracy: 0.6650
Simple Evaluation Iteration 10/20, Accuracy: 0.7250
Simple Evaluation Iteration 11/20, Accuracy: 0.7250
Simple Evaluation Iteration 12/20, Accuracy: 0.7200
Simple Evaluation Iteration 13/20, Accuracy: 0.7300
Simple Evaluation Iteration 14/20, Accuracy: 0.7300
Simple Evaluation Iteration 15/20, Accuracy: 0.6800
Simple Evaluation Iteration 16/20, Accuracy: 0.7350
Simple Evaluation Iteration 17/20, Accuracy: 0.7250
Simple Evaluation Iteration 18/20, Accuracy: 0.7250
Simple Evaluation Iteration 19/20, Accuracy: 0.7100
Simple Evaluation Iteration 20/20, Accuracy: 0.7200

Okay but what does this mean? What does it entail and what can we deduce from it? Does one perform better than the other one or do they all perform equally as well?

The next step is visualizing this, first I want you to try this out for yourself. In scikit-activeml there are visualisation methods (skactiveml.visualization) and these can help together with plt & animation to plot.

8. Make a visualization for each of the strategies. Showcasing the decision boundary after acquiring x amount of labels. BONUS: Make it so that it can be animated and it animates over 10 labels you acquired

In [6]:
%matplotlib ipympl

from matplotlib import pyplot as plt, animation
from skactiveml.pool import MonteCarloEER, QueryByCommittee
from IPython.display import HTML


# Initialise the classifier.
clf = ParzenWindowClassifier(classes=[0, 1], random_state=random_state)

# Initialise the query strategy.
qs = MonteCarloEER(method='misclassification_loss')
# qs = ProbabilisticAL()
# qs = UncertaintySampling(method='entropy', random_state=42)
# qs = QueryByCommittee(method='KL_divergence', sample_predictions_method_name='sample_proba', sample_predictions_dict={'n_samples': 100})

if isinstance(qs, ProbabilisticAL):
    dens_est = clf.fit(X, np.zeros(len(X)))
    dens = clf.predict_freq(X)[:, 0]

# Preparation for plotting.
fig, ax = plt.subplots()
feature_bound = [[min(X[:, 0]), min(X[:, 1])], [max(X[:, 0]), max(X[:, 1])]]
artists = []

# Active learning cycle:
n_cycles = 10
for c in range(n_cycles):
    # Fit the classifier with current labels.
    clf.fit(X, y)

    # Query the next sample(s).
    if isinstance(qs, ProbabilisticAL):
        query_idx = qs.query(X=X, y=y, clf=clf, utility_weight=dens)
    elif isinstance(qs,MonteCarloEER):
        query_idx = qs.query(X=X, y=y, clf=clf, ignore_partial_fit=True)
    elif isinstance(qs, UncertaintySampling):
        query_idx = qs.query(X=X, y=y, clf=clf)
    elif isinstance(qs,QueryByCommittee):
        query_idx = qs.query(X=X, y=y, ensemble=clf)

    # Capture the current plot state.
    coll_old = list(ax.collections)
    title = ax.text(
        0.5, 1.05,
        f"Decision boundary after acquiring {c} labels with {qs.__class__.__name__}",
        size=plt.rcParams["axes.titlesize"],
        ha="center", transform=ax.transAxes,
    )

    # Update plot with utility values, samples, and decision boundary.
    X_labeled = X[labeled_indices(y)]
    if isinstance(qs, ProbabilisticAL):
        ax = plot_utilities(
        qs,
        X=X, y=y, clf=clf, utility_weight=dens,
        candidates=None,
        res=25,
        feature_bound=feature_bound,
        ax=ax,
    )
    elif isinstance(qs,MonteCarloEER):
        ax = plot_utilities(
        qs,
        X=X, y=y, clf=clf, ignore_partial_fit=True,
        candidates=None,
        res=25,
        feature_bound=feature_bound,
        ax=ax,
        )
    elif isinstance(qs, UncertaintySampling):
        ax = plot_utilities(
        qs,
        X=X, y=y, clf=clf,
        candidates=None,
        res=25,
        feature_bound=feature_bound,
        ax=ax,
        )
    elif isinstance(qs,QueryByCommittee):
        ax = plot_utilities(
        qs,
        X=X, y=y, ensemble=clf,
        candidates=None,
        res=25,
        feature_bound=feature_bound,
        ax=ax,
        )
    
    ax.scatter(
        X[:, 0], X[:, 1], c=y_true, cmap="coolwarm", marker=".", zorder=2
    )
    ax.scatter(
        X_labeled[:, 0],
        X_labeled[:, 1],
        c="grey",
        alpha=0.8,
        marker=".",
        s=300,
    )
    ax = plot_decision_boundary(clf, feature_bound, ax=ax)

    coll_new = list(ax.collections)
    coll_new.append(title)
    artists.append([x for x in coll_new if x not in coll_old])

    # Update labels based on query.
    y[query_idx] = y_true[query_idx]

ani = animation.ArtistAnimation(fig, artists, interval=1000, blit=True)
# HTML(ani.to_jshtml())
Figure