import numpy as np
from scipy.optimize import minimize
from tqdm.notebook import tqdm
from Kernel_FeatureMaps import *
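
# `Kernel_FeatureMaps` is not shown in this file; it is assumed to expose `Kernel` and
# `FeatureMap` namespaces (e.g. `Kernel.Linear`, `FeatureMap.Linear`) used as defaults
# below. The commented sketch here only illustrates the assumed interface:
#
#     class Kernel:
#         @staticmethod
#         def Linear(A, B):
#             # K[i, j] = <A[i], B[j]>
#             return A @ B.T
#
#     class FeatureMap:
#         @staticmethod
#         def Linear(X):
#             # For the linear kernel, phi(x) = x.
#             return X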

class SVM(object):
    """
    ## Author: Hua Yao (hy2632@columbia.edu)
    ## Description:
        SVM binary classifier trained on a random sample batch by optimizing the dual
        Lagrangian program with the SMO algorithm.
        Input X is row-normalized to unit length to suit the kernelized version.
        The regularization parameter C is set to `np.inf` so that b has a simple
        closed-form solution.
    ## Reference:
        [CS229 - Kernel Methods and SVM](http://cs229.stanford.edu/notes2020fall/notes2020fall/cs229-notes3.pdf)

    ---
    Parameters:
    ---------
        X: (N, d)
        Y: (N,), labels in {-1, +1}
        kernel: kernel used, linear/softmax
        featuremap: feature map corresponding to the kernel used
        batch_size: int, also denoted as n
        C: box constraint of the soft margin, alpha_i in [0, C]. Set to np.inf
           (no regularization), because the closed form of b is messy under regularization.
        tol: tolerance (default 1e-6) deciding when to stop training

    Intermediate parameters:
    ---------
        x: (n, d), random batch drawn from X
        y: (n,)
        phi_x: (n, m), feature map of x
        M: (n, n), M[i, j] = y_i y_j * K(x_i, x_j), the Hadamard product of y y^T and K

    Learned parameters:
    ---------
        alpha: (n,)
        w: (d,)
        b: float
    """
    def __init__(self,
                 X,
                 Y,
                 kernel=Kernel.Linear,
                 featuremap=FeatureMap.Linear,
                 batch_size=64,
                 C=np.inf,
                 tol=1e-6):
        self.N, self.d = X.shape
        # Normalize each sample to unit l2 norm so kernels operate on unit vectors.
        self.X = X / np.linalg.norm(X, axis=1, keepdims=True)
        self.Y = Y
        self.kernel = kernel
        self.featuremap = featuremap
        self.n = batch_size
        self.C = C
        self.tol = tol

        # Draw a random batch of n distinct samples from X.
        batch_indices = np.random.choice(np.arange(self.N), self.n, replace=False)
        self.x = self.X[batch_indices]
        self.y = self.Y[batch_indices]
        self.phi_x = self.featuremap(self.x)
        # M[i, j] = y_i y_j * K(x_i, x_j): Hadamard product of y y^T and the Gram matrix.
        self.M = np.outer(self.y, self.y) * self.kernel(self.x, self.x)

        self.alpha = np.ones(self.n)
        self.w = np.zeros(self.d)
        self.b = 0

    def update_alpha(self, random_idx1, random_idx2):
        # Enforce the equality constraint sum_i alpha_i * y_i = 0: with
        # Zeta = -(sum of alpha_k * y_k over all other indices k), we need
        # alpha_1 * y_1 + alpha_2 * y_2 = Zeta; since y_1 in {-1, +1}, 1/y_1 = y_1.
        Zeta = -np.sum(self.alpha * self.y) + (
            self.alpha * self.y)[random_idx1] + (self.alpha * self.y)[random_idx2]
        self.alpha[random_idx1] = (Zeta - self.alpha[random_idx2] *
                                   self.y[random_idx2]) * self.y[random_idx1]

    def dual_obj(self, alpha):
        # Dual objective W(alpha) = sum_i alpha_i
        #                           - 0.5 * sum_{i,j} alpha_i alpha_j y_i y_j K(x_i, x_j).
        return np.sum(alpha) - np.sum(0.5 * self.M * np.outer(alpha, alpha))
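
    # `fit` runs a simplified SMO loop on the dual program
    #     max_alpha  sum_i alpha_i - 0.5 * sum_{i,j} alpha_i alpha_j y_i y_j K(x_i, x_j)
    #     s.t.       0 <= alpha_i <= C  and  sum_i alpha_i * y_i = 0,
    # repeatedly picking a pair (alpha_1, alpha_2) and optimizing it while the rest stay fixed.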
    def fit(self, iterations=200000):
        prev_val = self.alpha.copy()

        for i in tqdm(range(iterations)):
            # Pick two distinct coordinates to optimize jointly (SMO step).
            random_idx1, random_idx2 = np.random.choice(
                np.arange(0, self.n), 2, replace=False)

            def optimizeWRTa2(a2):
                # Negated dual objective as a function of alpha_2 alone, with alpha_1
                # adjusted to keep sum_i alpha_i * y_i = 0.
                self.alpha[random_idx2] = a2
                self.update_alpha(random_idx1, random_idx2)
                return -self.dual_obj(self.alpha)

            a2 = self.alpha[random_idx2]
            res = minimize(optimizeWRTa2, a2, bounds=[(0, self.C)])
            a2 = res.x[0]  # res.x is a length-1 array; extract the scalar

            self.alpha[random_idx2] = a2
            self.update_alpha(random_idx1, random_idx2)

            # Every 5 steps, check how much alpha has moved; stop early once it stabilizes.
            if i % 5 == 1:
                if np.sum(np.abs(self.alpha - prev_val)) < self.tol:
                    print(
                        f">> Optimized on the batch, step {i}. 5 steps Δalpha:{np.sum(np.abs(self.alpha - prev_val))}"
                    )
                    break  # leave the loop so that w and b below are still computed
                elif i % 5000 == 1:
                    print(
                        f">> Optimizing, step {i}. Δalpha:{np.sum(np.abs(self.alpha - prev_val))}"
                    )
                prev_val = self.alpha.copy()
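
        # Recover the primal parameters from the dual solution (see the CS229 notes):
        #     w = sum_i alpha_i * y_i * phi(x_i)
        # and, for the hard-margin case (C = np.inf),
        #     b = -(max_{i: y_i = -1} w . phi(x_i) + min_{i: y_i = +1} w . phi(x_i)) / 2.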
        self.w = np.dot(self.alpha * self.y, self.phi_x)
        self.b = (np.max(np.dot(self.phi_x, self.w)[self.y == -1]) +
                  np.min(np.dot(self.phi_x, self.w)[self.y == 1])) * -0.5

    def predict(self, X_val):
        # Kernelized decision function: sign(sum_i alpha_i * y_i * K(x_val, x_i) + b).
        X_val_normed = X_val / np.linalg.norm(X_val, axis=1, keepdims=True)
        return np.sign(
            np.dot(self.kernel(X_val_normed, self.x), self.alpha * self.y) + self.b)

    def score(self, X_val, y_val):
        prediction = self.predict(X_val)
        return np.mean(prediction == y_val)
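
# A minimal usage sketch (not part of the original class): trains the classifier on a
# synthetic, linearly separable problem, assuming `Kernel.Linear` / `FeatureMap.Linear`
# are provided by `Kernel_FeatureMaps` as in the defaults above.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    # Two Gaussian blobs with labels in {-1, +1}.
    X_pos = rng.randn(200, 2) + np.array([2.0, 2.0])
    X_neg = rng.randn(200, 2) + np.array([-2.0, -2.0])
    X_all = np.vstack([X_pos, X_neg])
    Y_all = np.concatenate([np.ones(200), -np.ones(200)])

    svm = SVM(X_all, Y_all, kernel=Kernel.Linear, featuremap=FeatureMap.Linear,
              batch_size=64)
    svm.fit(iterations=20000)
    print("train accuracy:", svm.score(X_all, Y_all))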