Data-Mining-Recitation1

Recitation1:

主要讨论Random Feature map 对高斯核的模拟,以及G正交化的影响

P.S. hexo部署时添加图片 Adding Images to Hexo Posts with Markdown npm i -s hexo-asset-link

jupyter导出pdf 安装TeX

Import packages

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import math
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import datetime
from scipy.stats import norm

Calculate Gram-Schmidt Orthogonalization

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def gram_schmidt_columns(X):
Q, R = np.linalg.qr(X)
return Q

def orthgonalize(V):
N = V.shape[0]
d = V.shape[1]
turns = int(N/d)
remainder = N%d

#V = np.random.normal(size=[(turns+1)*d, d])
#V = np.random.normal(size =[N, d])
#print(V.shape)
V_ = np.zeros_like(V)

for i in range(turns):
v = gram_schmidt_columns(V[i*d:(i+1)*d, :].T).T
V_[i*d:(i+1)*d, :] = v
if remainder != 0:
V_[turns*d:,:] = gram_schmidt_columns(V[turns*d:,:].T).T

return V_
Step 1: i.i.d. sampling
1
2
random_matrix = np.random.normal(0, 1, size = (6,3))
random_matrix
array([[ 1.59895494,  0.29885485,  0.43694542],
       [-0.43523095, -0.10324217, -1.28703438],
       [-0.7452128 , -0.12095329,  0.31481384],
       [ 0.0779075 ,  0.43587534, -0.04371553],
       [ 0.35963102,  0.90466958, -1.25109187],
       [ 1.58862906, -0.08350228, -0.51967566]])
Step 2: Calculate the norm of each sampling for later renormalization
1
2
norms = np.linalg.norm(random_matrix, axis = 1).reshape(6,1)
norms
array([[1.6843077 ],
       [1.36254996],
       [0.81797284],
       [0.44493588],
       [1.58524206],
       [1.67355242]])
Step 3: Orthogonalize the random features
1
a = orthgonalize(random_matrix)
1
a, a.shape
(array([[-0.94932472, -0.17743483, -0.25942137],
        [-0.25846666, -0.02888627,  0.9655882 ],
        [-0.17882269,  0.98370853, -0.01843854],
        [-0.17509826, -0.9796363 ,  0.09825129],
        [ 0.14721431, -0.12472189, -0.98120966],
        [ 0.97348269, -0.15734411,  0.16605506]]), (6, 3))
Step 4: Renormalize the orthogonalized random features
1
a*norms
array([[-1.59895494, -0.29885485, -0.43694542],
       [-0.35217373, -0.03935899,  1.31566217],
       [-0.14627211,  0.80464686, -0.01508222],
       [-0.0779075 , -0.43587534,  0.04371553],
       [ 0.23337031, -0.19771438, -1.55545483],
       [ 1.62917431, -0.26332362,  0.27790185]])
1
np.dot(a[0, :], a[1, :])
5.551115123125783e-17
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# G就是(w1, ..., wm)
# 高斯核的phi(x)
def baseline_eval_gaussian(x, G, m):
"""Calculate the result of baseline random feature mapping

Parameters
----------
x: array, dimension = d
The data point to input to the baseline mapping

G: matrix, dimension = m*d
The matrix in the baseline random feature mapping

m: integer
The number of dimension that we want to reduce to
"""
left = np.cos(np.dot(G, x).astype(np.float32))
right = np.sin(np.dot(G, x).astype(np.float32))
return ((1/m)**0.5) * np.append(left, right)

def find_sigma(random_sample):
all_distances = []
for i in range(len(random_sample)): # dimensionality: d
#print(f'Calculating the distance of {i}th samples')
distances = []
for j in range(len(random_sample)):
if j!=i:
distances.append(np.linalg.norm(random_sample[i] - random_sample[j]))
distances.sort()
all_distances.append(distances[50]) # 只取第50
# all_distances.append(np.mean(distances))
return np.mean(all_distances)
1
find_sigma(np.random.normal(0, 1, (100,50)))
10.11949067744566

Main

Load dataset
1
2
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1
%cd "/content/drive/My Drive/20FA/Recitation"
/content/drive/My Drive/20FA/Recitation
1
2
d = 10  # Dimension of data
d_ = 10 # Number of multipliers
1
2
letters = pd.read_csv('letter-recognition.csv')
letters.head()
letter xbox ybox width height onpix xbar ybar x2bar y2bar xybar x2ybar xy2bar xedge xedgey yedge yedgex
0 T 2 8 3 5 1 8 13 0 6 6 10 8 0 8 0 8
1 I 5 12 3 7 2 10 5 5 4 13 3 9 2 8 4 10
2 D 4 11 6 8 6 10 6 2 6 10 3 7 3 7 3 9
3 N 7 11 6 6 3 5 9 4 6 4 4 10 6 10 2 8
4 G 2 1 3 1 1 8 6 6 6 6 5 9 1 7 5 10
1
2
3
letters = np.asarray(letters)
data = letters[:, 1:d+1]
data = list(data)
1
data[0]
array([2, 8, 3, 5, 1, 8, 13, 0, 6, 6], dtype=object)
Parameters setup
1
2
3
4
#d = 10  # dimension of the data point 
N = [d*i for i in range(1,d_+1)] # number of samplings
episode = 100
epoch = 450 # number of experiments to perform for each episode
1
N
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
Start running
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# 正交化可以降低variance
# 对不同样本大小测试

iid_estimates = np.zeros((d_, episode, epoch))
orthog_estimates = np.zeros((d_, episode, epoch))

MSE_iid = []
MSE_orthog = []

MSE_iid_ = []
MSE_orthog_ = []

list_of_samples = []
true_values = []

random_sample = random.sample(data, 1000)
sigma = find_sigma(random_sample)

random_sample = [i/sigma for i in random_sample]
1
sigma
5.760360599001658
1
2
3
4
# 取出20000个数据点中的1000个作为random_sample
# find_sigma从每行中取第50个,整体做平均,可以省去一个mean的运算
# random_sample的每行数据点除以标准差,归一化
random_sample[0], len(random_sample)
(array([0.34720048608530246, 0.17360024304265123, 0.34720048608530246,
        0.34720048608530246, 0.17360024304265123, 0.8680012152132562,
        1.7360024304265125, 0.6944009721706049, 0.8680012152132562,
        1.7360024304265125], dtype=object), 1000)
1
2
3
# [tqdm](https://tqdm.github.io/)
for i in tqdm(range(10000000)):
pass
  0%|          | 0/10000000 [00:00<?, ?it/s] [A
  3%|▎         | 318248/10000000 [00:00<00:03, 3182475.45it/s] [A
  6%|▋         | 639853/10000000 [00:00<00:02, 3192472.62it/s] [A
 10%|▉         | 960927/10000000 [00:00<00:02, 3197929.60it/s] [A
 13%|█▎        | 1320869/10000000 [00:00<00:02, 3308643.11it/s] [A
 17%|█▋        | 1694130/10000000 [00:00<00:02, 3425362.30it/s] [A
 21%|██        | 2086242/10000000 [00:00<00:02, 3560404.17it/s] [A
 25%|██▍       | 2491758/10000000 [00:00<00:02, 3695671.55it/s] [A
 29%|██▉       | 2901121/10000000 [00:00<00:01, 3806689.20it/s] [A
 33%|███▎      | 3300819/10000000 [00:00<00:01, 3861844.78it/s] [A
 37%|███▋      | 3714063/10000000 [00:01<00:01, 3939226.23it/s] [A
 41%|████▏     | 4127830/10000000 [00:01<00:01, 3996729.87it/s] [A
 45%|████▌     | 4548321/10000000 [00:01<00:01, 4056985.18it/s] [A
 50%|████▉     | 4974120/10000000 [00:01<00:01, 4115263.66it/s] [A
 54%|█████▍    | 5401755/10000000 [00:01<00:01, 4162298.98it/s] [A
 58%|█████▊    | 5841688/10000000 [00:01<00:00, 4230680.58it/s] [A
 63%|██████▎   | 6272699/10000000 [00:01<00:00, 4254198.50it/s] [A
 67%|██████▋   | 6700434/10000000 [00:01<00:00, 4261115.77it/s] [A
 71%|███████▏  | 7141608/10000000 [00:01<00:00, 4305207.12it/s] [A
 76%|███████▌  | 7594108/10000000 [00:01<00:00, 4368868.06it/s] [A
 80%|████████  | 8036809/10000000 [00:02<00:00, 4386147.78it/s] [A
 85%|████████▍ | 8484505/10000000 [00:02<00:00, 4413000.38it/s] [A
 89%|████████▉ | 8925941/10000000 [00:02<00:00, 4386127.92it/s] [A
 94%|█████████▍| 9377578/10000000 [00:02<00:00, 4424403.07it/s] [A
100%|██████████| 10000000/10000000 [00:02<00:00, 4092170.78it/s]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
for e in tqdm(range(episode), position=0, leave=True): # episode = 100

x, y = random.sample(random_sample, k=2)

true_value = math.exp((-1)*(np.linalg.norm(x - y)**2)/(2)) #K_Gauss(x, y) = e^{(x-y)^2/2}
list_of_samples.append((x, y)) # 1000个值,与true_value对应
true_values.append(true_value)

for n in range(len(N)): # 10组样本,sample_size10到100

mse_iid_n = []
mse_orthog_n = []

for e in tqdm(range(episode), position=0, leave=True):
#print(f'{N[n]} samplings, {e}th episode')
true = np.repeat(true_values[e], epoch) # epoch=450

x = list_of_samples[e][0]
y = list_of_samples[e][1]

for i in range(epoch):

np.random.seed()
# N[n]即m的值变化,aka V=[w1, ..., wm]变化
V = np.random.normal(0, 1, (N[n],d)) #create N*d iid matrix
V_orthog = orthgonalize(V) #create N*d matrix with orthogonal blocks
norms = np.linalg.norm(V, axis=1).reshape([N[n],1])

# K(x, y) = Φ(x)^T Φ(y)
# Φ(x) = 1/sqrt(m) * [cos(w1x), ... cos(wmx), sin(w1x), ..., sin(wmx)]
iid_estimates[n, e, i] = np.dot(baseline_eval_gaussian(x, V, N[n]), # Original "G"
baseline_eval_gaussian(y, V, N[n]))

orthog_estimates[n, e, i] = np.dot(baseline_eval_gaussian(x, V_orthog*norms, N[n]), # Orthogonalized "G"
baseline_eval_gaussian(y, V_orthog*norms, N[n]))

mse_iid_n.append(mean_squared_error(true, iid_estimates[n, e])) # compare m values
mse_orthog_n.append(mean_squared_error(true, orthog_estimates[n, e]))

MSE_iid_.append(mse_iid_n)
MSE_iid.append(np.mean(mse_iid_n))

MSE_orthog_.append(mse_orthog_n)
MSE_orthog.append(np.mean(mse_orthog_n))
100%|██████████| 100/100 [00:00<00:00, 41614.29it/s]
100%|██████████| 100/100 [00:21<00:00,  4.61it/s]
100%|██████████| 100/100 [00:26<00:00,  3.74it/s]
100%|██████████| 100/100 [00:31<00:00,  3.16it/s]
100%|██████████| 100/100 [00:35<00:00,  2.78it/s]
100%|██████████| 100/100 [00:40<00:00,  2.47it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:49<00:00,  2.00it/s]
100%|██████████| 100/100 [00:54<00:00,  1.84it/s]
100%|██████████| 100/100 [00:58<00:00,  1.72it/s]
100%|██████████| 100/100 [01:02<00:00,  1.60it/s]
Visualization
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
sns.set_style("darkgrid")    

x1 = range(1, d_+1)
x1 = x1 + np.zeros((episode, d_))
x1 = np.sort(x1.reshape(episode*d_,))

mse_iid_ = np.asarray(MSE_iid_).reshape(-1)
category = ['MC' for i in range(episode*d_)]
df1 = pd.DataFrame({'D/d': x1, 'MSE':mse_iid_, 'Category':category})

mse_ortho_ = np.asarray(MSE_orthog_).reshape(-1)
category = ['OMC' for i in range(episode*d_)]
df3 = pd.DataFrame({'D/d': x1, 'MSE':mse_ortho_, 'Category':category})

df = pd.concat([df1, df3], ignore_index=True)

sns.lineplot(x="D/d", y="MSE", hue="Category",
err_style="bars", ci=95, data=df)
plt.title('Gaussian_kernel_Comparison')
plt.savefig(f'Gaussian_kernel_Comparison_{datetime.datetime.now()}.png', dpi = 300)
plt.show()

1
# !sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-generic-recommended
1
!jupyter nbconvert --to PDF DM_recitation.ipynb
[NbConvertApp] Converting notebook DM_recitation.ipynb to PDF
[NbConvertApp] Support files will be in DM_recitation_files/
[NbConvertApp] Making directory ./DM_recitation_files
[NbConvertApp] Writing 76544 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: [u'xelatex', u'./notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: [u'bibtex', u'./notebook']
[NbConvertApp] WARNING | bibtex had problems, most likely because there were no citations
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 75123 bytes to DM_recitation.pdf
1