Python, MATLAB, Julia, R code: Chapter 9

Acknowledgement: The Julia code is written by the contributors listed here.

Chapter 9.1 Confidence Interval

Histogram of the sample average

% MATLAB code
gmm = gmdistribution([0; 5], cat(3,1,1), [0.3 0.7]);
x = linspace(-5, 10, 1000)';
f = pdf(gmm, x);
figure;
plot(x,f,'LineWidth', 6,'color',[0.1, 0.4, 0.6]);
grid on;
axis([-5 10 0 0.35]);
legend('Population', 'Location', 'NW');
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);

for i=1:4
    figure;
    bin = -5:10;
    Y = random(gmm, 50);
    [num,val] = hist(Y,bin);
    bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
    tt = sprintf('Mean = %3.2f', mean(Y));
    legend(tt, 'Location', 'NW');
    grid on;
    axis([-5 10 0 20]);
    set(gcf, 'Position', [100, 100, 600, 300]);
    set(gca,'FontWeight','bold','fontsize',14);
end

bin = 2:0.1:5;
for i=1:10000
    Y(:,i) = random(gmm, 50);
end
M = mean(Y);
[num,val] = hist(M,bin);
bar(val, num,'FaceColor',[0.2, 0.2, 0.2]);
grid on;
axis([2 5 0 1200]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
legend('Sample Average');
# Julia: Histogram of the sample average
using Distributions, Plots, Printf
gmm = MixtureModel(Normal[Normal(), Normal(5., 1.),], [0.3, 0.7])
x = range(-5., 10., length=1000)
# the population distribution
plot(x, pdf(gmm, x), linewidth=3, label=false, title="Population distribution")
# histograms of 4 random samples of size 50
plot_array = []
for i=1:4
    bin = -5:10
    Y = rand(gmm, 50)
    tt = @sprintf("Mean = %3.2f", mean(Y))
    push!(plot_array, histogram(Y, bins=bin, label = tt, legend=:topleft))
end
plot(plot_array...)
# histogram of sample average
M = zeros(10000)
bin = range(2., 5., step=0.1)
for i = 1:10000
    M[i] = mean(rand(gmm, 50))
end
histogram(M, bins=bin, label=false, title="Histogram of the sample average")

Compute confidence interval

Given alpha, compute the width of the confidence interval:

 epsilon ge Phi^{-1}left(1-frac{alpha}{2}right)
% MATLAB code to compute the width of the confidence interval
alpha = 0.05;
mu = 0; sigma = 1; % Standard Gaussian
epsilon = icdf('norm',1-alpha/2,mu,sigma)
# Python code to compute the width of the confidence interval
import scipy.stats as stats
alph = 0.05;
mu   = 0; sigma = 1; # Standard Gaussian
epsilon = stats.norm.ppf(1-alph/2, mu, sigma)
print(epsilon)
# Julia code to compute the width of the confidence interval
using Distributions
alpha = 0.05
mu = 0.
sigma = 1.
epsilon = quantile(Normal(mu, sigma ), 1. - alpha/2.) # CI is estimator ± epsilon

Visualize the t-distribution

% MATLAB code to plot the t-distribution
x = linspace(-5,5,100);
p1 = pdf('norm',x,0,1);
p2 = pdf('t',x,11-1);
p3 = pdf('t',x,3-1);
p4 = pdf('t',x,2-1);

figure;
plot(x,p1,'-.','LineWidth',4,'Color',[0 0.6 0]); hold on;
plot(x,p2,'-o','LineWidth',2,'Color',[1 0.6 0.2]);
plot(x,p3,'-^','LineWidth',2,'Color',[0.2 0.2 0.7]);
plot(x,p4,'-s','LineWidth',2,'Color',[0.7 0.2 0.2]);
legend('Gaussian(0,1)', 't-dist, N = 11', 't-dist, N = 3', 't-dist, N = 2');
grid on;
xticks(-5:1:5);
yticks(0:0.05:0.4);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
# Julia code to plot the t-distribution
using Distributions, Plots
x = range(-5.,5.,length=100)
p1 = pdf(Normal(),x)
p2 = pdf(TDist(11-1),x)
p3 = pdf(TDist(3-1),x)
p4 = pdf(TDist(2-1),x)
plot([p1,p2,p3,p4], label = ["Gaussian(0,1)" "t-dist, N = 11" "t-dist, N = 3" "t-dist, N = 2"], legend=:topright)

Construct a confidence interval from data

# Python code to generate a confidence interval
import numpy as np
import scipy.stats as stats
x = np.array([72, 69, 75, 58, 67, 70, 60, 71, 59, 65])
N         = x.size
Theta_hat = np.mean(x) # Sample mean
S_hat     = np.std(x)  # Sample standard deviation
nu        = x.size-1   # degrees of freedom
alpha     = 0.05       # confidence level
z    = stats.t.ppf(1-alph/2, nu)
CI_L = Theta_hat-z*S_hat/np.sqrt(N)
CI_U = Theta_hat+z*S_hat/np.sqrt(N)
print(CI_L, CI_U)
# Julia code to generate a confidence interval
using Statistics, Distributions, Printf
x = [72., 69., 75., 58., 67., 70., 60., 71., 59., 65.]
thetahat = mean(x)  # Sample mean
sigmahat = std(x)   # Sample standard deviation
N = size(x,1)
nu = N-1 # degrees of freedom
alpha = 0.05        # confidence level
z = quantile(TDist(nu), 1. - alpha/2.)
CI_L = thetahat - z*sigmahat/sqrt(N)
CI_U = thetahat + z*sigmahat/sqrt(N)
@printf("Confidence interval: [%5.3f - %5.3f]", CI_L, CI_U)

Chapter 9.2 Bootstrap

Visualize bootstrap

% MATLAB code to illustrate bootstrap
gmm = gmdistribution([0; 5], cat(3,1,1), [0.3 0.7]);
x = linspace(-5, 10, 1000)';
f = pdf(gmm, x);
figure;
plot(x,f,'LineWidth', 6,'color',[0.1, 0.4, 0.6]);
grid on;
axis([-5 10 0 0.35]);
legend('Population', 'Location', 'NW');
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);

% Visualize bootstrap dataset
N = 100;
X = random(gmm, N);
figure;
[num,val] = hist(X, linspace(-5,10,30));
bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);

for i=1:3
    % Bootstrap dataset
    idx = randi(N,[1, N]);
    Xb1 = X(idx);
    figure;
    [num,val] = hist(Xb1, linspace(-5,10,30));
    bar(val, num,'FaceColor',[0.1, 0.4, 0.6]);
    set(gcf, 'Position', [100, 100, 600, 300]);
    set(gca,'FontWeight','bold','fontsize',14);
end

% Bootstrap
K = 1000;
Thetahat = zeros(1,K);
for i=1:K                       % repeat K times
    idx = randi(N,[1, N]);      % sampling w/ replacement
    Y = X(idx);
    Thetahat(i) = median(Y);    % estimator
end
M = mean(Thetahat)              % bootstrap mean
V = var(Thetahat)               % bootstrap variance
figure;
[num,val] = hist(Thetahat, linspace(3.5,5.5,30));
bar(val, num,'FaceColor',[0.2, 0.2, 0.2]);
set(gcf, 'Position', [100, 100, 600, 300]);
set(gca,'FontWeight','bold','fontsize',14);
# Julia code to illustrate bootstrap
using Plots, Distributions, Statistics
gmm = MixtureModel(Normal[Normal(), Normal(5., 1.),], [0.3, 0.7])
# visualize the population
x = range(-5., 10., length=1000);
f = pdf(gmm, x);
plot(x, pdf(gmm,x), linewidth=3, label="Population", legend=:topleft)
# Visualize bootstrap dataset, the real sample data
N = 100
X = rand(gmm, N) # the real sample data
p1 = histogram(X, bins = range(-5,10,length=30), label=false)
# visualize 3 bootstrap samples
plot_array = []
for i=1:3
    # Bootstrap dataset
    idx = rand(1:N, N)
    Xb1 = X[idx]
    push!(plot_array, histogram(Xb1, bins = range(-5.,10.,length=30)))
end
plot(plot_array..., label=false)

# Bootstrap the median
K = 1000 # number of bootstrap re-samples
thetahat = zeros(K)
for i=1:K               # repeat K times
    idx = rand(1:N, N)  # sampling w/ replacement
    Y = X[idx]
    thetahat[i] = median(Y) # estimator
end
M = mean(thetahat)          # bootstrap mean
V = var(thetahat)           # bootstrap variance
# histogram of bootstrap replications of estimator
histogram(thetahat, bins=range(3.5,5.5,length=30), legend=false)

Bootstrap median

% MATLAB code to estimate a bootstrap variance
X = [72, 69, 75, 58, 67, 70, 60, 71, 59, 65];
N = size(X,2);
K = 1000;
Thetahat = zeros(1,K);
for i=1:K                       % repeat K times
    idx = randi(N,[1, N]);      % sampling w/ replacement
    Y = X(idx);
    Thetahat(i) = median(Y);    % estimator
end
M = mean(Thetahat)              % bootstrap mean
V = var(Thetahat)               % bootstrap variance
# Python code to estimate a bootstrap variance
import numpy as np
X = np.array([72, 69, 75, 58, 67, 70, 60, 71, 59, 65])
N = X.size
K = 1000
Thetahat = np.zeros(K)
for i in range(K):
  idx = np.random.randint(N, size=N)
  Y = X[idx]
  Thetahat[i] = np.median(Y)
M = np.mean(Thetahat)
V = np.var(Thetahat)
# Julia code to estimate a bootstrap variance
using Statistics
X = [72., 69., 75., 58., 67., 70., 60., 71., 59., 65.]
N = size(X,1)
K = 1000
thetahat = zeros(K)
for i=1:K                       # repeat K times
    idx = rand(1:N, N)          # sampling w/ replacement
    Y = X[idx]
    thetahat[i] = median(Y)         # estimator
end
M = mean(thetahat)                  # bootstrap mean
V = var(thetahat)                   # bootstrap variance

Chapter 9.3 Hypothesis Testing

Estimate Z-value

% MATLAB command to estimate the Z_hat value.
Theta_hat = 0.29;                    % Your estimate
theta    = 0.25;                     % Your hypothesis
sigma    = sqrt(theta*(1-theta));    % Known standard deviation
N        = 1000;                     % Number of samples
Z_hat    = (Theta_hat - theta)/(sigma/sqrt(N));
# Python command to estimate the Z_hat value
import numpy as np
Theta_hat = 0.29                     # Your estimate
theta     = 0.25                     # Your hypothesis
N         = 1000                     # Number of samples
sigma     = np.sqrt(theta*(1-theta)) # Known standard deviation
Z_hat = (Theta_hat - theta)/(sigma / np.sqrt(N))
print(Z_hat)
# Julia Estimate Z-value
thetahat = 0.29                 # Your estimate
theta = 0.25                    # Your hypothesis
sigma = sqrt(theta*(1. - theta))        # Known standard deviation
N = 1000                    # Number of samples
Z_hat = (thetahat - theta)/(sigma/sqrt(N))

Compute critical value

% MATLAB code to compute the critical value
alpha = 0.05;
z_alpha = icdf('norm', 1-alpha, 0, 1);
# Python code to compute the critical value
import scipy.stats as stats
alpha = 0.05
z_alpha = stats.norm.ppf(1-alpha, 0, 1)
# Julia code to compute the critical value
using Distributions
alpha = 0.05
z_alpha = quantile(Normal(), 1-alpha)

Compute p-value

% MATLAB code to compute the p-value
p = cdf('norm', -1.92, 0, 1);
# Python code to compute the p-value
import scipy.stats as stats
p = stats.norm.cdf(-1.92,0,1)
# Julia code to compute the p-value
using Distributions
p = cdf(Normal(), -1.92)

Chapter 9.5 ROC Curve

Plot an ROC curve

% MATLAB code to plot ROC curve
sigma = 2;  mu = 3;
alphaset = linspace(0,1,1000);
PF1 = zeros(1,1000); PD1 = zeros(1,1000);
PF2 = zeros(1,1000); PD2 = zeros(1,1000);
for i=1:1000
    alpha = alphaset(i);
    PF1(i) = alpha;
    PD1(i) = alpha;

    PF2(i) = alpha;
    PD2(i) = 1-normcdf(norminv(1-alpha)-mu/sigma);
end
figure;
plot(PF1, PD1,'LineWidth', 4, 'Color', [0.8, 0, 0]); hold on;
plot(PF2, PD2,'LineWidth', 4, 'Color', [0, 0, 0]); hold off;
# Python code to plot ROC curve
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

sigma = 2; mu = 3;
alphaset = np.linspace(0,1,1000)
PF1 = np.zeros(1000); PD1 = np.zeros(1000)
PF2 = np.zeros(1000); PD2 = np.zeros(1000)
for i in range(1000):
  alpha = alphaset[i]
  PF1[i] = alpha
  PD1[i] = alpha
  PF2[i] = alpha
  PD2[i] = 1-stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma)
plt.plot(PF1,PD1)
plt.plot(PF2,PD2)
# Julia code to plot ROC curve
using Distributions, Plots, Statistics
sigma = 2.
mu = 3.
PF1 = zeros(1000); PD1 = zeros(1000)
PF2 = zeros(1000); PD2 = zeros(1000)
alphaset = range(0.,1.,length=1000)
d = Normal()
for i=1:1000
    alpha = alphaset[i]
    PF1[i] = alpha
    PD1[i] = alpha
    PF2[i] = alpha
    PD2[i] = 1. - cdf(d, quantile(d, 1-alpha)-mu/sigma)
end
plot(PF1, PD1, linewidth=3, label = "Blind guess")
plot!(PF2, PD2, linewidth=3, label = "Neyman-Pearson", legend=:bottomright)
# Computer area under curve
auc1 = sum(PD1.*[0.; diff(PF1)])
auc2 = sum(PD2.*[0.; diff(PF2)])

Computer area under curve

% MATLAB
auc1 = sum(PD1.*[0 diff(PF1)])
auc2 = sum(PD2.*[0 diff(PF2)])
# Python
auc1 = np.sum(PD1 * np.append(0, np.diff(PF1)))
auc2 = np.sum(PD2 * np.append(0, np.diff(PF2)))

Another ROC curve

% MATLAB code to generate the ROC curve.
sigma = 2;  mu = 3;
PF1 = zeros(1,1000); PD1 = zeros(1,1000);
PF2 = zeros(1,1000); PD2 = zeros(1,1000);
alphaset = linspace(0,0.5,1000);
for i=1:1000
    alpha = alphaset(i);
    PF1(i) = 2*alpha;
    PD1(i) = 1-(normcdf(norminv(1-alpha)-mu/sigma)-...
                normcdf(-norminv(1-alpha)-mu/sigma));
end
alphaset = linspace(0,1,1000);
for i=1:1000
    alpha = alphaset(i);
    PF2(i) = alpha;
    PD2(i) = 1-normcdf(norminv(1-alpha)-mu/sigma);
end
figure;
plot(PF1, PD1,'LineWidth', 4, 'Color', [0.8, 0, 0]); hold on;
plot(PF2, PD2,'LineWidth', 4, 'Color', [0, 0, 0]); hold off;
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

sigma = 2; mu = 3;
PF1 = np.zeros(1000); PD1 = np.zeros(1000)
PF2 = np.zeros(1000); PD2 = np.zeros(1000)

alphaset = np.linspace(0,0.5,1000)
for i in range(1000):
  alpha = alphaset[i]
  PF1[i] = 2*alpha
  PD1[i] = 1-(stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma) \
            -stats.norm.cdf(-stats.norm.ppf(1-alpha)-mu/sigma))

alphaset = np.linspace(0,1,1000)
for i in range(1000):
  alpha = alphaset[i]
  PF2[i] = alpha
  PD2[i] = 1-stats.norm.cdf(stats.norm.ppf(1-alpha)-mu/sigma)

plt.plot(PF1, PD1)
plt.plot(PF2, PD2)
# Julia code to generate the ROC curve.
using Distributions, Plots
sigma = 2.
mu = 3.
PF1 = zeros(1000); PD1 = zeros(1000)
PF2 = zeros(1000); PD2 = zeros(1000)
alphaset = range(0.,0.5,length=1000)
d = Normal()
for i=1:1000
    alpha = alphaset[i]
    PF1[i] = 2. *alpha
    PD1[i] = 1. - (cdf(d, quantile(d,1. - alpha) - mu/sigma) -
        cdf(d, -quantile(d, 1. -alpha) - mu/sigma))
end
alphaset = range(0.,1.,length=1000)
for i=1:1000
    alpha = alphaset[i]
    PF2[i] = alpha
    PD2[i] = 1. - cdf(d, quantile(d, 1-alpha)-mu/sigma)
end
args = (linewidth=3, xlabel = "p_F", ylabel="p_D")
plot(PF1, PD1; args..., label = "Proposed decision")
plot!(PF2, PD2; args..., label = "Neyman-Pearson", legend=:bottomright)

ROC on real data

Data download: ch9_ROC_example_data.txt (790KB)

% MATLAB: construct data
% Do not worry if you cannot understand this code.
% It is not the focus on this book.
load fisheriris
pred = meas(51:end,1:2);
resp = (1:100)'>50;
mdl = fitglm(pred,resp,'Distribution','binomial','Link','logit');
scores = mdl.Fitted.Probability;
labels = [ones(1,50), zeros(1,50)];
save('ch9_ROC_example_data','scores','labels');
% MATLAB code to generate an empirical ROC curve
load ch9_ROC_example_data
tau = linspace(0,1,1000);
for i=1:1000
    idx   = (scores <= tau(i));
    predict = zeros(1,100);
    predict(idx)   = 1;
    true_positive  = 0; true_negative  = 0;
    false_positive = 0; false_negative = 0;
    for j=1:100
        if (predict(j)==1) && (labels(j)==1)
            true_positive = true_positive + 1;   end
        if (predict(j)==1) && (labels(j)==0)
            false_positive = false_positive + 1; end
        if (predict(j)==0) && (labels(j)==1)
            false_negative = false_negative + 1; end
        if (predict(j)==0) && (labels(j)==0)
            true_negative = true_negative + 1;   end
    end
    PF(i) = false_positive/50;
    PD(i) = true_positive/50;
end
plot(PF, PD, 'LineWidth', 4, 'Color', [0, 0, 0]);
# Python code to generate an empirical ROC curve
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
scores = np.loadtxt('ch9_ROC_example_data.txt')
labels = np.append(np.ones(50), np.zeros(50))
tau = np.linspace(0,1,1000)
PF = np.zeros(1000)
PD = np.zeros(1000)
for i in range(1000):
  idx = scores<= tau[i]
  predict = np.zeros(100)
  predict[idx] = 1
  true_positive  = 0;   true_negative  = 0
  false_positive = 0;   false_negative = 0
  for j in range(100):
    if (predict[j]==1) and (labels[j]==1): true_positive  += 1
    if (predict[j]==1) and (labels[j]==0): false_positive += 1
    if (predict[j]==0) and (labels[j]==1): false_negative += 1
    if (predict[j]==0) and (labels[j]==0): true_negative  += 1
  PF[i] = false_positive/50
  PD[i] = true_positive/50
plt.plot(PF, PD)
# Julia code to generate an empirical ROC curve
using DelimitedFiles, Plots
scores = readdlm("ch9_ROC_example_data.txt")
scores = scores[:] # a vector
tau = range(0.,1.,length=1000)
labels = [ones(50); zeros(50)]
PF = zeros(1000)
PD = zeros(1000)
for i=1:1000
    idx   = (scores .<= tau[i])
    predict = zeros(100)
    predict[idx]   .= 1.
    true_positive  = 0; true_negative  = 0;
    false_positive = 0; false_negative = 0;
    for j=1:100
        if (predict[j]==1) && (labels[j]==1)
            true_positive += 1   end
        if (predict[j]==1) && (labels[j]==0)
            false_positive += 1 end
        if (predict[j]==0) && (labels[j]==1)
            false_negative += 1 end
        if (predict[j]==0) && (labels[j]==0)
            true_negative += 1   end
    end
    PF[i] = false_positive/50
    PD[i] = true_positive/50
end
plot(PF, PD, linewidth=3, legend=false)