Data-Mining-Recitation1

发表于 2020-10-05 分类于 Data Mining Disqus：

Recitation1:

主要讨论Random Feature map 对高斯核的模拟，以及G正交化的影响

P.S. hexo部署时添加图片 Adding Images to Hexo Posts with Markdown npm i -s hexo-asset-link

jupyter导出pdf 安装TeX

Import packages

import numpy as np
import math
from sklearn.metrics import mean_squared_error 
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import datetime
from scipy.stats import norm

Calculate Gram-Schmidt Orthogonalization

def gram_schmidt_columns(X):
    Q, R = np.linalg.qr(X)
    return Q

def orthgonalize(V):
    N = V.shape[0]
    d = V.shape[1]
    turns = int(N/d)
    remainder = N%d
    
    #V =  np.random.normal(size=[(turns+1)*d, d])
    #V =  np.random.normal(size =[N, d])
    #print(V.shape)
    V_ = np.zeros_like(V)
    
    for i in range(turns):
        v = gram_schmidt_columns(V[i*d:(i+1)*d, :].T).T
        V_[i*d:(i+1)*d, :] = v
    if remainder != 0:
        V_[turns*d:,:] = gram_schmidt_columns(V[turns*d:,:].T).T
        
    return V_

Step 1: i.i.d. sampling

1 2	random_matrix = np.random.normal(0, 1, size = (6,3)) random_matrix

array([[ 1.59895494,  0.29885485,  0.43694542],
       [-0.43523095, -0.10324217, -1.28703438],
       [-0.7452128 , -0.12095329,  0.31481384],
       [ 0.0779075 ,  0.43587534, -0.04371553],
       [ 0.35963102,  0.90466958, -1.25109187],
       [ 1.58862906, -0.08350228, -0.51967566]])

Step 2: Calculate the norm of each sampling for later renormalization

1 2	norms = np.linalg.norm(random_matrix, axis = 1).reshape(6,1) norms

array([[1.6843077 ],
       [1.36254996],
       [0.81797284],
       [0.44493588],
       [1.58524206],
       [1.67355242]])

Step 3: Orthogonalize the random features

1	a = orthgonalize(random_matrix)

1	a, a.shape

(array([[-0.94932472, -0.17743483, -0.25942137],
        [-0.25846666, -0.02888627,  0.9655882 ],
        [-0.17882269,  0.98370853, -0.01843854],
        [-0.17509826, -0.9796363 ,  0.09825129],
        [ 0.14721431, -0.12472189, -0.98120966],
        [ 0.97348269, -0.15734411,  0.16605506]]), (6, 3))

Step 4: Renormalize the orthogonalized random features

a*norms

array([[-1.59895494, -0.29885485, -0.43694542],
       [-0.35217373, -0.03935899,  1.31566217],
       [-0.14627211,  0.80464686, -0.01508222],
       [-0.0779075 , -0.43587534,  0.04371553],
       [ 0.23337031, -0.19771438, -1.55545483],
       [ 1.62917431, -0.26332362,  0.27790185]])

1	np.dot(a[0, :], a[1, :])

5.551115123125783e-17

# G就是(w1, ..., wm)
# 高斯核的phi(x)
def baseline_eval_gaussian(x, G, m):
    """Calculate the result of baseline random feature mapping

        Parameters
        ----------
        x: array, dimension = d
            The data point to input to the baseline mapping
        
        G: matrix, dimension = m*d
            The matrix in the baseline random feature mapping
            
        m: integer
            The number of dimension that we want to reduce to
    """
    left = np.cos(np.dot(G, x).astype(np.float32))
    right = np.sin(np.dot(G, x).astype(np.float32))
    return ((1/m)**0.5) * np.append(left, right)

def find_sigma(random_sample):
    all_distances = []
    for i in range(len(random_sample)): # dimensionality: d
        #print(f'Calculating the distance of {i}th samples')
        distances = []
        for j in range(len(random_sample)):
            if j!=i:
                distances.append(np.linalg.norm(random_sample[i] - random_sample[j]))
        distances.sort()
        all_distances.append(distances[50]) # 只取第50个
        # all_distances.append(np.mean(distances))
    return np.mean(all_distances)

1	find_sigma(np.random.normal(0, 1, (100,50)))

10.11949067744566

Main

Load dataset

1 2	from google.colab import drive drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

1	%cd "/content/drive/My Drive/20FA/Recitation"

/content/drive/My Drive/20FA/Recitation

1 2	d = 10 # Dimension of data d_ = 10 # Number of multipliers

1 2	letters = pd.read_csv('letter-recognition.csv') letters.head()

	letter	xbox	ybox	width	height	onpix	xbar	ybar	x2bar	y2bar	xybar	x2ybar	xy2bar	xedge	xedgey	yedge	yedgex
0	T	2	8	3	5	1	8	13	0	6	6	10	8	0	8	0	8
1	I	5	12	3	7	2	10	5	5	4	13	3	9	2	8	4	10
2	D	4	11	6	8	6	10	6	2	6	10	3	7	3	7	3	9
3	N	7	11	6	6	3	5	9	4	6	4	4	10	6	10	2	8
4	G	2	1	3	1	1	8	6	6	6	6	5	9	1	7	5	10

1
2
3

letters = np.asarray(letters)
data = letters[:, 1:d+1]
data = list(data)

data[0]

array([2, 8, 3, 5, 1, 8, 13, 0, 6, 6], dtype=object)

Parameters setup

#d = 10  # dimension of the data point 
N = [d*i for i in range(1,d_+1)]  # number of samplings
episode = 100
epoch =  450  # number of experiments to perform for each episode

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

Start running

# 正交化可以降低variance
# 对不同样本大小测试

iid_estimates = np.zeros((d_, episode, epoch))
orthog_estimates = np.zeros((d_, episode, epoch))

MSE_iid = []
MSE_orthog = []

MSE_iid_ = []
MSE_orthog_ = []

list_of_samples = []
true_values = []

random_sample = random.sample(data, 1000)
sigma = find_sigma(random_sample)

random_sample = [i/sigma for i in random_sample]

sigma

5.760360599001658

# 取出20000个数据点中的1000个作为random_sample
# find_sigma从每行中取第50个，整体做平均，可以省去一个mean的运算
# random_sample的每行数据点除以标准差，归一化
random_sample[0], len(random_sample)

(array([0.34720048608530246, 0.17360024304265123, 0.34720048608530246,
        0.34720048608530246, 0.17360024304265123, 0.8680012152132562,
        1.7360024304265125, 0.6944009721706049, 0.8680012152132562,
        1.7360024304265125], dtype=object), 1000)

1
2
3

# [tqdm](https://tqdm.github.io/)
for i in tqdm(range(10000000)):
    pass

  0%|          | 0/10000000 [00:00<?, ?it/s] [A
  3%|▎         | 318248/10000000 [00:00<00:03, 3182475.45it/s] [A
  6%|▋         | 639853/10000000 [00:00<00:02, 3192472.62it/s] [A
 10%|▉         | 960927/10000000 [00:00<00:02, 3197929.60it/s] [A
 13%|█▎        | 1320869/10000000 [00:00<00:02, 3308643.11it/s] [A
 17%|█▋        | 1694130/10000000 [00:00<00:02, 3425362.30it/s] [A
 21%|██        | 2086242/10000000 [00:00<00:02, 3560404.17it/s] [A
 25%|██▍       | 2491758/10000000 [00:00<00:02, 3695671.55it/s] [A
 29%|██▉       | 2901121/10000000 [00:00<00:01, 3806689.20it/s] [A
 33%|███▎      | 3300819/10000000 [00:00<00:01, 3861844.78it/s] [A
 37%|███▋      | 3714063/10000000 [00:01<00:01, 3939226.23it/s] [A
 41%|████▏     | 4127830/10000000 [00:01<00:01, 3996729.87it/s] [A
 45%|████▌     | 4548321/10000000 [00:01<00:01, 4056985.18it/s] [A
 50%|████▉     | 4974120/10000000 [00:01<00:01, 4115263.66it/s] [A
 54%|█████▍    | 5401755/10000000 [00:01<00:01, 4162298.98it/s] [A
 58%|█████▊    | 5841688/10000000 [00:01<00:00, 4230680.58it/s] [A
 63%|██████▎   | 6272699/10000000 [00:01<00:00, 4254198.50it/s] [A
 67%|██████▋   | 6700434/10000000 [00:01<00:00, 4261115.77it/s] [A
 71%|███████▏  | 7141608/10000000 [00:01<00:00, 4305207.12it/s] [A
 76%|███████▌  | 7594108/10000000 [00:01<00:00, 4368868.06it/s] [A
 80%|████████  | 8036809/10000000 [00:02<00:00, 4386147.78it/s] [A
 85%|████████▍ | 8484505/10000000 [00:02<00:00, 4413000.38it/s] [A
 89%|████████▉ | 8925941/10000000 [00:02<00:00, 4386127.92it/s] [A
 94%|█████████▍| 9377578/10000000 [00:02<00:00, 4424403.07it/s] [A
100%|██████████| 10000000/10000000 [00:02<00:00, 4092170.78it/s]

for e in tqdm(range(episode), position=0, leave=True): # episode = 100

    x, y = random.sample(random_sample, k=2)

    true_value = math.exp((-1)*(np.linalg.norm(x - y)**2)/(2)) #K_Gauss(x, y) = e^{(x-y)^2/2}
    list_of_samples.append((x, y)) # 1000个值，与true_value对应
    true_values.append(true_value)

for n in range(len(N)): # 10组样本，sample_size10到100

    mse_iid_n = []
    mse_orthog_n = []

    for e in tqdm(range(episode), position=0, leave=True):
        #print(f'{N[n]} samplings, {e}th episode')
        true = np.repeat(true_values[e], epoch) # epoch=450
        
        x = list_of_samples[e][0]
        y = list_of_samples[e][1]
        
        for i in range(epoch):

            np.random.seed()
            # N[n]即m的值变化，aka V=[w1, ..., wm]变化
            V = np.random.normal(0, 1, (N[n],d))   #create N*d iid matrix
            V_orthog = orthgonalize(V)   #create N*d matrix with orthogonal blocks
            norms = np.linalg.norm(V, axis=1).reshape([N[n],1])
            
            # K(x, y) = Φ(x)^T Φ(y)
            # Φ(x) = 1/sqrt(m) * [cos(w1x), ... cos(wmx), sin(w1x), ..., sin(wmx)]
            iid_estimates[n, e, i] = np.dot(baseline_eval_gaussian(x, V, N[n]), # Original "G"
                        baseline_eval_gaussian(y, V, N[n]))

            orthog_estimates[n, e, i] = np.dot(baseline_eval_gaussian(x, V_orthog*norms, N[n]), # Orthogonalized "G"
                        baseline_eval_gaussian(y, V_orthog*norms, N[n]))
            
        mse_iid_n.append(mean_squared_error(true, iid_estimates[n, e])) # compare m values
        mse_orthog_n.append(mean_squared_error(true, orthog_estimates[n, e]))
    
    MSE_iid_.append(mse_iid_n)
    MSE_iid.append(np.mean(mse_iid_n))
    
    MSE_orthog_.append(mse_orthog_n)
    MSE_orthog.append(np.mean(mse_orthog_n))

100%|██████████| 100/100 [00:00<00:00, 41614.29it/s]
100%|██████████| 100/100 [00:21<00:00,  4.61it/s]
100%|██████████| 100/100 [00:26<00:00,  3.74it/s]
100%|██████████| 100/100 [00:31<00:00,  3.16it/s]
100%|██████████| 100/100 [00:35<00:00,  2.78it/s]
100%|██████████| 100/100 [00:40<00:00,  2.47it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:49<00:00,  2.00it/s]
100%|██████████| 100/100 [00:54<00:00,  1.84it/s]
100%|██████████| 100/100 [00:58<00:00,  1.72it/s]
100%|██████████| 100/100 [01:02<00:00,  1.60it/s]

Visualization

sns.set_style("darkgrid")    

x1 = range(1, d_+1)
x1 = x1 + np.zeros((episode, d_))
x1 = np.sort(x1.reshape(episode*d_,)) 

mse_iid_ = np.asarray(MSE_iid_).reshape(-1)
category = ['MC' for i in range(episode*d_)]
df1 = pd.DataFrame({'D/d': x1, 'MSE':mse_iid_, 'Category':category})

mse_ortho_ = np.asarray(MSE_orthog_).reshape(-1)
category = ['OMC' for i in range(episode*d_)]
df3 = pd.DataFrame({'D/d': x1, 'MSE':mse_ortho_, 'Category':category})

df = pd.concat([df1, df3], ignore_index=True)

sns.lineplot(x="D/d", y="MSE", hue="Category",
                   err_style="bars", ci=95, data=df)
plt.title('Gaussian_kernel_Comparison')
plt.savefig(f'Gaussian_kernel_Comparison_{datetime.datetime.now()}.png', dpi = 300)
plt.show()

1	# !sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-generic-recommended

1	!jupyter nbconvert --to PDF DM_recitation.ipynb

[NbConvertApp] Converting notebook DM_recitation.ipynb to PDF
[NbConvertApp] Support files will be in DM_recitation_files/
[NbConvertApp] Making directory ./DM_recitation_files
[NbConvertApp] Writing 76544 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: [u'xelatex', u'./notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: [u'bibtex', u'./notebook']
[NbConvertApp] WARNING | bibtex had problems, most likely because there were no citations
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 75123 bytes to DM_recitation.pdf