Python, MATLAB, Julia, R code: Chapter 9
Acknowledgement: The Julia code is written by the contributors listed here.
Chapter 9.1 Confidence Interval
Histogram of the sample average
% MATLAB code
gmm = gmdistribution([0; 5], cat(3,1,1), [0.3 0.7]);
x = linspace(-5, 10, 1000)';
f = pdf(gmm, x);
figure;
plot(x,f,'LineWidth', 6,'color',[0.1, 0.4, 0.6]);
grid on;
axis([-5 10 0 0.35]);
legend('Population', 'Location', 'NW');
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
for i=1:4
figure;
bin = -5:10;
Y = random(gmm, 50);
[num,val] = hist(Y,bin);
bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
tt = sprintf('Mean = %3.2f', mean(Y));
legend(tt, 'Location', 'NW');
grid on;
axis([-5 10 0 20]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
end
bin = 2:0.1:5;
for i=1:10000
Y(:,i) = random(gmm, 50);
end
M = mean(Y);
[num,val] = hist(M,bin);
bar(val, num,'FaceColor',[0.2, 0.2, 0.2]);
grid on;
axis([2 5 0 1200]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
legend('Sample Average');
using Distributions, Plots, Printf
gmm = MixtureModel(Normal[Normal(), Normal(5., 1.),], [0.3, 0.7])
x = range(-5., 10., length=1000)
plot(x, pdf(gmm, x), linewidth=3, label=false, title="Population distribution")
plot_array = []
for i=1:4
bin = -5:10
Y = rand(gmm, 50)
tt = @sprintf("Mean = %3.2f", mean(Y))
push!(plot_array, histogram(Y, bins=bin, label = tt, legend=:topleft))
end
plot(plot_array...)
M = zeros(10000)
bin = range(2., 5., step=0.1)
for i = 1:10000
M[i] = mean(rand(gmm, 50))
end
histogram(M, bins=bin, label=false, title="Histogram of the sample average")
Compute confidence interval
Given , compute the width of the confidence interval:
% MATLAB code to compute the width of the confidence interval
alpha = 0.05;
mu = 0; sigma = 1; % Standard Gaussian
epsilon = icdf('norm',1-alpha/2,mu,sigma)
import scipy.stats as stats
alph = 0.05;
mu = 0; sigma = 1;
epsilon = stats.norm.ppf(1-alph/2, mu, sigma)
print(epsilon)
using Distributions
alpha = 0.05
mu = 0.
sigma = 1.
epsilon = quantile(Normal(mu, sigma ), 1. - alpha/2.)
alpha = 0.05
mu = 0
sigma = 1
epsilon = qnorm(0.975, mu, sigma)
epsilon
Visualize the -distribution
% MATLAB code to plot the t-distribution
x = linspace(-5,5,100);
p1 = pdf('norm',x,0,1);
p2 = pdf('t',x,11-1);
p3 = pdf('t',x,3-1);
p4 = pdf('t',x,2-1);
figure;
plot(x,p1,'-.','LineWidth',4,'Color',[0 0.6 0]); hold on;
plot(x,p2,'-o','LineWidth',2,'Color',[1 0.6 0.2]);
plot(x,p3,'-^','LineWidth',2,'Color',[0.2 0.2 0.7]);
plot(x,p4,'-s','LineWidth',2,'Color',[0.7 0.2 0.2]);
legend('Gaussian(0,1)', 't-dist, N = 11', 't-dist, N = 3', 't-dist, N = 2');
grid on;
xticks(-5:1:5);
yticks(0:0.05:0.4);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
using Distributions, Plots
x = range(-5.,5.,length=100)
p1 = pdf(Normal(),x)
p2 = pdf(TDist(11-1),x)
p3 = pdf(TDist(3-1),x)
p4 = pdf(TDist(2-1),x)
plot([p1,p2,p3,p4], label = ["Gaussian(0,1)" "t-dist, N = 11" "t-dist, N = 3" "t-dist, N = 2"], legend=:topright)
Construct a confidence interval from data
import numpy as np
import scipy.stats as stats
x = np.array([72, 69, 75, 58, 67, 70, 60, 71, 59, 65])
N = x.size
Theta_hat = np.mean(x)
S_hat = np.std(x)
nu = x.size-1
alpha = 0.05
z = stats.t.ppf(1-alph/2, nu)
CI_L = Theta_hat-z*S_hat/np.sqrt(N)
CI_U = Theta_hat+z*S_hat/np.sqrt(N)
print(CI_L, CI_U)
using Statistics, Distributions, Printf
x = [72., 69., 75., 58., 67., 70., 60., 71., 59., 65.]
thetahat = mean(x)
sigmahat = std(x)
N = size(x,1)
nu = N-1
alpha = 0.05
z = quantile(TDist(nu), 1. - alpha/2.)
CI_L = thetahat - z*sigmahat/sqrt(N)
CI_U = thetahat + z*sigmahat/sqrt(N)
@printf("Confidence interval: [%5.3f - %5.3f]", CI_L, CI_U)
x <- c(72,69,75,58,67,70,60,71,59,65)
N = length(x)
Theta_hat = mean(x)
S_hat = sd(x)
nu = length(x) - 1
alpha = 0.05
z = qt(1-alpha/2, nu)
CI_L = Theta_hat-z*S_hat/sqrt(N)
CI_U = Theta_hat+z*S_hat/sqrt(N)
CI_L
CI_U
Chapter 9.2 Bootstrap
Visualize bootstrap
% MATLAB code to illustrate bootstrap
gmm = gmdistribution([0; 5], cat(3,1,1), [0.3 0.7]);
x = linspace(-5, 10, 1000)';
f = pdf(gmm, x);
figure;
plot(x,f,'LineWidth', 6,'color',[0.1, 0.4, 0.6]);
grid on;
axis([-5 10 0 0.35]);
legend('Population', 'Location', 'NW');
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
% Visualize bootstrap dataset
N = 100;
X = random(gmm, N);
figure;
[num,val] = hist(X, linspace(-5,10,30));
bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
for i=1:3
% Bootstrap dataset
idx = randi(N,[1, N]);
Xb1 = X(idx);
figure;
[num,val] = hist(Xb1, linspace(-5,10,30));
bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
end
% Bootstrap
K = 1000;
Thetahat = zeros(1,K);
for i=1:K % repeat K times
idx = randi(N,[1, N]); % sampling w/ replacement
Y = X(idx);
Thetahat(i) = median(Y); % estimator
end
M = mean(Thetahat) % bootstrap mean
V = var(Thetahat) % bootstrap variance
figure;
[num,val] = hist(Thetahat, linspace(3.5,5.5,30));
bar(val, num,'FaceColor',[0.2, 0.2, 0.2]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
using Plots, Distributions, Statistics
gmm = MixtureModel(Normal[Normal(), Normal(5., 1.),], [0.3, 0.7])
x = range(-5., 10., length=1000);
f = pdf(gmm, x);
plot(x, pdf(gmm,x), linewidth=3, label="Population", legend=:topleft)
N = 100
X = rand(gmm, N)
p1 = histogram(X, bins = range(-5,10,length=30), label=false)
plot_array = []
for i=1:3
idx = rand(1:N, N)
Xb1 = X[idx]
push!(plot_array, histogram(Xb1, bins = range(-5.,10.,length=30)))
end
plot(plot_array..., label=false)
K = 1000
thetahat = zeros(K)
for i=1:K
idx = rand(1:N, N)
Y = X[idx]
thetahat[i] = median(Y)
end
M = mean(thetahat)
V = var(thetahat)
histogram(thetahat, bins=range(3.5,5.5,length=30), legend=false)
Bootstrap median
% MATLAB code to estimate a bootstrap variance
X = [72, 69, 75, 58, 67, 70, 60, 71, 59, 65];
N = size(X,2);
K = 1000;
Thetahat = zeros(1,K);
for i=1:K % repeat K times
idx = randi(N,[1, N]); % sampling w/ replacement
Y = X(idx);
Thetahat(i) = median(Y); % estimator
end
M = mean(Thetahat) % bootstrap mean
V = var(Thetahat) % bootstrap variance
import numpy as np
X = np.array([72, 69, 75, 58, 67, 70, 60, 71, 59, 65])
N = X.size
K = 1000
Thetahat = np.zeros(K)
for i in range(K):
idx = np.random.randint(N, size=N)
Y = X[idx]
Thetahat[i] = np.median(Y)
M = np.mean(Thetahat)
V = np.var(Thetahat)
using Statistics
X = [72., 69., 75., 58., 67., 70., 60., 71., 59., 65.]
N = size(X,1)
K = 1000
thetahat = zeros(K)
for i=1:K
idx = rand(1:N, N)
Y = X[idx]
thetahat[i] = median(Y)
end
M = mean(thetahat)
V = var(thetahat)
X <- c(72, 69, 75, 58, 67, 70, 60, 71, 59, 65)
N = length(X)
K = 1000
Thetahat = rep(0, K)
for (i in 0:K) {
idx = sample(N,5)
Y = X[idx]
Thetahat[i] = median(Y)
}
M = mean(Thetahat)
V = var(Thetahat)
M
V
Chapter 9.3 Hypothesis Testing
Estimate Z-value
% MATLAB command to estimate the Z_hat value.
Theta_hat = 0.29; % Your estimate
theta = 0.25; % Your hypothesis
sigma = sqrt(theta*(1-theta)); % Known standard deviation
N = 1000; % Number of samples
Z_hat = (Theta_hat - theta)/(sigma/sqrt(N));
import numpy as np
Theta_hat = 0.29
theta = 0.25
N = 1000
sigma = np.sqrt(theta*(1-theta))
Z_hat = (Theta_hat - theta)/(sigma / np.sqrt(N))
print(Z_hat)
thetahat = 0.29
theta = 0.25
sigma = sqrt(theta*(1. - theta))
N = 1000
Z_hat = (thetahat - theta)/(sigma/sqrt(N))
Theta_hat = 0.29
theta = 0.25
N = 1000
sigma = sqrt(theta*(1-theta))
Z_hat = (Theta_hat - theta)/(sigma / sqrt(N))
Z_hat
Compute critical value
% MATLAB code to compute the critical value
alpha = 0.05;
z_alpha = icdf('norm', 1-alpha, 0, 1);
import scipy.stats as stats
alpha = 0.05
z_alpha = stats.norm.ppf(1-alpha, 0, 1)
using Distributions
alpha = 0.05
z_alpha = quantile(Normal(), 1-alpha)
alpha = 0.05
z_alpha = qnorm(1-alpha, 0, 1)
z_alpha
Compute p-value
% MATLAB code to compute the p-value
p = cdf('norm', -1.92, 0, 1);
import scipy.stats as stats
p = stats.norm.cdf(-1.92,0,1)
using Distributions
p = cdf(Normal(), -1.92)
Chapter 9.5 ROC Curve
Plot an ROC curve
% MATLAB code to plot ROC curve
sigma = 2; mu = 3;
alphaset = linspace(0,1,1000);
PF1 = zeros(1,1000); PD1 = zeros(1,1000);
PF2 = zeros(1,1000); PD2 = zeros(1,1000);
for i=1:1000
alpha = alphaset(i);
PF1(i) = alpha;
PD1(i) = alpha;
PF2(i) = alpha;
PD2(i) = 1-normcdf(norminv(1-alpha)-mu/sigma);
end
figure;
plot(PF1, PD1,'LineWidth', 4, 'Color', [0.8, 0, 0]); hold on;
plot(PF2, PD2,'LineWidth', 4, 'Color', [0, 0, 0]); hold off;
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
sigma = 2; mu = 3;
alphaset = np.linspace(0,1,1000)
PF1 = np.zeros(1000); PD1 = np.zeros(1000)
PF2 = np.zeros(1000); PD2 = np.zeros(1000)
for i in range(1000):
alpha = alphaset[i]
PF1[i] = alpha
PD1[i] = alpha
PF2[i] = alpha
PD2[i] = 1-stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma)
plt.plot(PF1,PD1)
plt.plot(PF2,PD2)
using Distributions, Plots, Statistics
sigma = 2.
mu = 3.
PF1 = zeros(1000); PD1 = zeros(1000)
PF2 = zeros(1000); PD2 = zeros(1000)
alphaset = range(0.,1.,length=1000)
d = Normal()
for i=1:1000
alpha = alphaset[i]
PF1[i] = alpha
PD1[i] = alpha
PF2[i] = alpha
PD2[i] = 1. - cdf(d, quantile(d, 1-alpha)-mu/sigma)
end
plot(PF1, PD1, linewidth=3, label = "Blind guess")
plot!(PF2, PD2, linewidth=3, label = "Neyman-Pearson", legend=:bottomright)
library(matlab)
sigma = 2
mu = 3
alphaset <- linspace(0,1,1000)
PF1 = rep(0, 1000)
PD1 = rep(0, 1000)
PF2 = rep(0, 1000)
PD2 = rep(0, 1000)
for (i in 0:1000) {
alpha = alphaset[i]
PF1[i] = alpha
PD1[i] = alpha
PF2[i] = alpha
PD2[i] = 1-pnorm(qnorm(1-alpha)-mu/sigma)
}
plot(PF1,PD1,type = "l", col = "red")
lines(PF2, PD2, col = "black")
Computer area under curve
% MATLAB
auc1 = sum(PD1.*[0 diff(PF1)])
auc2 = sum(PD2.*[0 diff(PF2)])
auc1 = np.sum(PD1 * np.append(0, np.diff(PF1)))
auc2 = np.sum(PD2 * np.append(0, np.diff(PF2)))
auc1 = sum(PD1.*[0.; diff(PF1)])
auc2 = sum(PD2.*[0.; diff(PF2)])
auc1 = sum(PD1 * c(0, diff(PF1)))
auc2 = sum(PD2 * c(0, diff(PF2)))
auc1
auc2
Another ROC curve
% MATLAB code to generate the ROC curve.
sigma = 2; mu = 3;
PF1 = zeros(1,1000); PD1 = zeros(1,1000);
PF2 = zeros(1,1000); PD2 = zeros(1,1000);
alphaset = linspace(0,0.5,1000);
for i=1:1000
alpha = alphaset(i);
PF1(i) = 2*alpha;
PD1(i) = 1-(normcdf(norminv(1-alpha)-mu/sigma)-...
normcdf(-norminv(1-alpha)-mu/sigma));
end
alphaset = linspace(0,1,1000);
for i=1:1000
alpha = alphaset(i);
PF2(i) = alpha;
PD2(i) = 1-normcdf(norminv(1-alpha)-mu/sigma);
end
figure;
plot(PF1, PD1,'LineWidth', 4, 'Color', [0.8, 0, 0]); hold on;
plot(PF2, PD2,'LineWidth', 4, 'Color', [0, 0, 0]); hold off;
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
sigma = 2; mu = 3;
PF1 = np.zeros(1000); PD1 = np.zeros(1000)
PF2 = np.zeros(1000); PD2 = np.zeros(1000)
alphaset = np.linspace(0,0.5,1000)
for i in range(1000):
alpha = alphaset[i]
PF1[i] = 2*alpha
PD1[i] = 1-(stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma) \
-stats.norm.cdf(-stats.norm.ppf(1-alpha)-mu/sigma))
alphaset = np.linspace(0,1,1000)
for i in range(1000):
alpha = alphaset[i]
PF2[i] = alpha
PD2[i] = 1-stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma)
plt.plot(PF1, PD1)
plt.plot(PF2, PD2)
using Distributions, Plots
sigma = 2.
mu = 3.
PF1 = zeros(1000); PD1 = zeros(1000)
PF2 = zeros(1000); PD2 = zeros(1000)
alphaset = range(0.,0.5,length=1000)
d = Normal()
for i=1:1000
alpha = alphaset[i]
PF1[i] = 2. *alpha
PD1[i] = 1. - (cdf(d, quantile(d,1. - alpha) - mu/sigma) -
cdf(d, -quantile(d, 1. -alpha) - mu/sigma))
end
alphaset = range(0.,1.,length=1000)
for i=1:1000
alpha = alphaset[i]
PF2[i] = alpha
PD2[i] = 1. - cdf(d, quantile(d, 1-alpha)-mu/sigma)
end
args = (linewidth=3, xlabel = "p_F", ylabel="p_D")
plot(PF1, PD1; args..., label = "Proposed decision")
plot!(PF2, PD2; args..., label = "Neyman-Pearson", legend=:bottomright)
sigma = 2
mu = 3
PF1 = rep(0, 1000)
PD1 = rep(0, 1000)
PF2 = rep(0, 1000)
PD2 = rep(0, 1000)
alphaset <- linspace(0,0.5,1000)
for (i in 0:1000) {
alpha = alphaset[i]
PF1[i] = 2*alpha
PD1[i] = 1-(pnorm(qnorm(1-alpha)-mu/sigma) - pnorm(-qnorm(1-alpha)-mu/sigma))
}
alphaset = linspace(0,1,1000)
for (i in 0:1000) {
alpha = alphaset[i]
PF2[i] = alpha
PD2[i] = 1-pnorm(qnorm(1-alpha)-mu/sigma)
}
plot(PF1, PD1, type = "l", col = "red")
lines(PF2, PD2, col = "black")
ROC on real data
Data download: ch9_ROC_example_data.txt (790KB)
% MATLAB: construct data
% Do not worry if you cannot understand this code.
% It is not the focus on this book.
load fisheriris
pred = meas(51:end,1:2);
resp = (1:100)'>50;
mdl = fitglm(pred,resp,'Distribution','binomial','Link','logit');
scores = mdl.Fitted.Probability;
labels = [ones(1,50), zeros(1,50)];
save('ch9_ROC_example_data','scores','labels');
% MATLAB code to generate an empirical ROC curve
load ch9_ROC_example_data
tau = linspace(0,1,1000);
for i=1:1000
idx = (scores <= tau(i));
predict = zeros(1,100);
predict(idx) = 1;
true_positive = 0; true_negative = 0;
false_positive = 0; false_negative = 0;
for j=1:100
if (predict(j)==1) && (labels(j)==1)
true_positive = true_positive + 1; end
if (predict(j)==1) && (labels(j)==0)
false_positive = false_positive + 1; end
if (predict(j)==0) && (labels(j)==1)
false_negative = false_negative + 1; end
if (predict(j)==0) && (labels(j)==0)
true_negative = true_negative + 1; end
end
PF(i) = false_positive/50;
PD(i) = true_positive/50;
end
plot(PF, PD, 'LineWidth', 4, 'Color', [0, 0, 0]);
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
scores = np.loadtxt('ch9_ROC_example_data.txt')
labels = np.append(np.ones(50), np.zeros(50))
tau = np.linspace(0,1,1000)
PF = np.zeros(1000)
PD = np.zeros(1000)
for i in range(1000):
idx = scores<= tau[i]
predict = np.zeros(100)
predict[idx] = 1
true_positive = 0; true_negative = 0
false_positive = 0; false_negative = 0
for j in range(100):
if (predict[j]==1) and (labels[j]==1): true_positive += 1
if (predict[j]==1) and (labels[j]==0): false_positive += 1
if (predict[j]==0) and (labels[j]==1): false_negative += 1
if (predict[j]==0) and (labels[j]==0): true_negative += 1
PF[i] = false_positive/50
PD[i] = true_positive/50
plt.plot(PF, PD)
using DelimitedFiles, Plots
scores = readdlm("ch9_ROC_example_data.txt")
scores = scores[:]
tau = range(0.,1.,length=1000)
labels = [ones(50); zeros(50)]
PF = zeros(1000)
PD = zeros(1000)
for i=1:1000
idx = (scores .<= tau[i])
predict = zeros(100)
predict[idx] .= 1.
true_positive = 0; true_negative = 0;
false_positive = 0; false_negative = 0;
for j=1:100
if (predict[j]==1) && (labels[j]==1)
true_positive += 1 end
if (predict[j]==1) && (labels[j]==0)
false_positive += 1 end
if (predict[j]==0) && (labels[j]==1)
false_negative += 1 end
if (predict[j]==0) && (labels[j]==0)
true_negative += 1 end
end
PF[i] = false_positive/50
PD[i] = true_positive/50
end
plot(PF, PD, linewidth=3, legend=false)
scores <- c(0.8271,
0.6045, 0.7916, 0.1608, 0.6112, 0.2555, 0.5682, 0.0599, 0.6644, 0.1129,
0.0615, 0.3525, 0.3227, 0.4334, 0.2281, 0.722, 0.2353, 0.285, 0.4107, 0.2008,
0.3712, 0.4235, 0.4876, 0.4235, 0.5751, 0.6734, 0.7356, 0.7138, 0.3874,
0.2404, 0.1663, 0.1663, 0.285, 0.3684, 0.1738, 0.4364, 0.722, 0.4675, 0.2353,
0.172, 0.1779, 0.4434, 0.2769, 0.0689, 0.2141, 0.2712, 0.2633, 0.4806,
0.0885, 0.2555, 0.5682, 0.285, 0.8422, 0.5281, 0.6303, 0.9325, 0.0622,
0.8823, 0.6707, 0.8917, 0.6489, 0.5552, 0.751, 0.2331, 0.2933, 0.6045,
0.6303, 0.9585, 0.9343, 0.3227, 0.7982, 0.221, 0.9391, 0.5079, 0.7379, 0.875,
0.4705, 0.4434, 0.5652, 0.8659, 0.897, 0.9713, 0.5652, 0.518, 0.4039, 0.9435,
0.5781, 0.5947, 0.397, 0.7916, 0.722, 0.7916, 0.285, 0.7659, 0.7379, 0.7138,
0.4876, 0.6303, 0.5311, 0.3525)
labels = append(rep(1,50), rep(0, 50))
tau = linspace(0,1,1000)
PF = rep(0, 1000)
PD = rep(0, 1000)
for (i in 0:1000) {
idx = scores <= tau[i]
predict = rep(0,100)
predict[idx] = 1
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for (j in 0:100) {
a = predict[j]
b = labels[j]
cond1 = FALSE
cond2 = FALSE
if (a == 1) cond1 = TRUE
if (b == 1) {
cond2 = TRUE
}
if (cond1 == cond2 && cond1 == TRUE) {
true_positive = true_positive + 1
}
if (predict[j]==1 && labels[j]==0) {
false_positive = false_positive + 1
}
if (predict[j]==0 && labels[j]==1) {
false_negative = false_negative + 1
}
if (predict[j]==0 && labels[j]==0) {
true_negative = true_negative + 1
}
}
PF[i] = false_positive/50
PD[i] = true_positive/50
}
plot(PF, PD)
|