from read_data_fx import read_tb_data
data = read_tb_data(1, 20)  # data from problem 1.20
Y = data[:, 0]  # total time spent in servicing (min)
X = data[:, 1]  # number of copiers serviced
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def lin_reg(X, Y):
    """Ordinary least squares fit of Y on X; returns estimates and sums of squares."""
    barX = np.mean(X); barY = np.mean(Y)
    XminusbarX = X - barX; YminusbarY = Y - barY
    b1 = np.sum(XminusbarX*YminusbarY)/np.sum(XminusbarX**2)  # slope estimate
    b0 = barY - b1*barX                                       # intercept estimate
    Yhat = b0 + b1*X                # fitted values
    e_i = Y - Yhat                  # residuals
    sse = np.sum(e_i**2)            # error sum of squares
    ssr = np.sum((Yhat - barY)**2)  # regression sum of squares
    return {'Yhat': Yhat, 'b0': b0, 'b1': b1, 'e_i': e_i, 'sse': sse, 'ssr': ssr}
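As a quick sanity check (an addition, not part of the original analysis), the estimates from `lin_reg` can be compared against `scipy.stats.linregress`:
# Cross-check lin_reg against scipy's built-in simple linear regression.
res = lin_reg(X, Y)
slope, intercept, rvalue, pvalue, stderr = stats.linregress(X, Y)
print(res['b1'], slope)      # slopes should agree to floating-point precision
print(res['b0'], intercept)  # intercepts likewise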
# plot the fitted line and the data
%matplotlib inline
linreg = lin_reg(X, Y)
b0 = linreg['b0']; b1 = linreg['b1']; Yhat = linreg['Yhat']
plt.figure(figsize=(10, 8))
plt.plot(X, Y, 'bo', ms=20)
plt.plot(X, Yhat, 'r-', lw=5);
plt.xlim(0.1, 11); plt.ylim(0, 170);
plt.xlabel('Number of copiers serviced', fontsize=15);
plt.ylabel('Total time spent in servicing (min)', fontsize=15);
We are given that the mean required time should not increase by more than 14 minutes for each additional copier serviced on a service call. This implies we need to test the following alternatives:
$H_0: \beta_1 \leq 14$
$H_a: \beta_1 \gt 14$
The test statistic for this case is
$t^{*}=\frac{b_1-14}{s\{b_1\}}$
where $s\{b_1\}=\sqrt{\frac{MSE}{\sum{(X_i-\overline{X})^2}}}$
Decision rule:
If $t^* \leq t(1-\alpha;n-2)$, conclude $H_0$
If $t^* \gt t(1-\alpha;n-2)$, conclude $H_a$
n = len(X); sse = linreg['sse']; XminusbarX = X - np.mean(X)
MSE = sse/(n - 2.0)                        # mean squared error
ssb1 = np.sqrt(MSE/np.sum(XminusbarX**2))  # estimated s{b1}
tstar = (b1 - 14.0)/ssb1                   # test statistic t*
from scipy import stats
# critical t value at alpha = 0.05
t1 = stats.t.ppf(1 - 0.05, n - 2)
# fancy display of relationships
tstar0 = format(tstar, '.3f'); t0 = format(t1, '.3f');
from IPython.display import display, Math, Latex
display(Math(r't^*='+str(tstar0)))
display(Math(r't='+str(t0)))
We observe that $t^* \gt t$; therefore we conclude $H_a$.
The P-value is $P\{t(n-2) \gt t^*\}$:
pval=1.0-stats.t.cdf(tstar,n-2);
pval0=format(pval,'.3f')
display(Math(r'p-value='+str(pval0)))
We observe that the p-value is less than the significance level of 0.05, hence we conclude $H_a$ directly.
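Next we construct a 90% confidence band for the regression line using the Working-Hotelling procedure: at each $X$ level the band is $\hat{Y} \pm W s\{\hat{Y}\}$, where $W^2=2F(1-\alpha;2,n-2)$ and $s\{\hat{Y}\}=\sqrt{MSE\bigg(\frac{1}{n}+\frac{(X-\overline{X})^2}{\sum{(X_i-\overline{X})^2}}\bigg)}$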
barX = np.mean(X)
# s{Yhat} at each observed X level
s_of_yh_hat = np.sqrt(MSE*(1.0/n + (X - barX)**2/np.sum(XminusbarX**2)))
# Working-Hotelling multiplier W for a 90% band
W = np.sqrt(2.0*stats.f.ppf(0.90, 2, n - 2))
cb_upper = Yhat + W*s_of_yh_hat
cb_lower = Yhat - W*s_of_yh_hat
plt.figure(figsize=(10,8))
plt.plot(X,Y,'ko',ms=20)
plt.plot(X,Yhat ,'r-',lw=5);
plt.xlim(0.1,11); plt.ylim(0,170);
plt.xlabel('Number of copiers serviced', fontsize=15);
plt.ylabel('Total time spent in servicing (min)', fontsize=15);
# X values are not sorted in this example, sort it to plot the confidence band properly
idx=np.argsort(X)
plt.plot(X[idx],cb_lower[idx],'b--',linewidth=5)
plt.plot(X[idx],cb_upper[idx],'b--',linewidth=5);
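Next we compute a 90% prediction interval for the service time on a new call with $X_h=6$ copiers: $\hat{Y}_h \pm t(1-\alpha/2;n-2)\,s\{pred\}$, where $s\{pred\}=\sqrt{MSE\bigg(1+\frac{1}{n}+\frac{(X_h-\overline{X})^2}{\sum{(X_i-\overline{X})^2}}\bigg)}$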
Xh = 6.0
# s{pred}: standard error for predicting a new observation at Xh
s_of_pred = np.sqrt(MSE*(1.0 + 1.0/n + (Xh - barX)**2/np.sum(XminusbarX**2)))
t1 = stats.t.ppf(1 - 0.1/2.0, n - 2)   # t(0.95; n-2) for a 90% interval
Yhat_h = b0 + b1*Xh
print(Yhat_h, s_of_pred, t1)
ub0 = Yhat_h + t1*s_of_pred; lb0 = Yhat_h - t1*s_of_pred
ub0 = format(ub0, '.2f'); lb0 = format(lb0, '.2f')
from IPython.display import display, Math, Latex
display(Math(str(lb0) + r' \leq Y_{h(new)} \leq ' + str(ub0)))
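That is, with 90% confidence the total service time on a single new call with six copiers is predicted to fall between these limits.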
The Breusch-Pagan test assumes that the variance of the error term $\epsilon_i$, denoted by $\sigma_i^2$, is related to the level of $X$ in the following way:
$$\ln \sigma_i^2 = \gamma_0+\gamma_1X_i$$
The alternatives in this test are:
$H_0 : \gamma_1=0$ (constant error variance)
$H_a: \gamma_1 \neq 0$ (non-constant error variance)
The test statistic is
$\chi_{BP}^2=\frac{SSR^*/2}{(SSE/n)^2}$, where $SSR^*$ is the regression sum of squares when regressing $e^2$ on $X$.
We know that $\chi_{BP}^2 \sim \chi^2(1)$ approximately for large $n$.
Decision rules are:
If $\chi_{BP}^2 \leq \chi^2(1-\alpha;1)$, conclude $H_0$
If $\chi_{BP}^2 \gt \chi^2(1-\alpha;1)$, conclude $H_a$
# Breusch-Pagan test
fit = lin_reg(X, Y)
sse = fit['sse']; e_i = fit['e_i']
ssrstar = lin_reg(X, e_i**2)['ssr']   # SSR* from regressing e^2 on X
Xchi2 = (ssrstar/2.0)/(sse/n)**2      # BP test statistic
Xchi02 = format(Xchi2, '.3f')
display(Math(r'\chi_{BP}^2=' + str(Xchi02)))
chisq = stats.chi2.ppf(0.95, 1)       # critical value chi^2(0.95; 1)
chisq = format(chisq, '.3f')
display(Math(r'\chi^2=' + str(chisq)))
We observe that $\chi_{BP}^2 \lt \chi^2(1-\alpha;1)$; therefore we conclude $H_0$, i.e., the error variance is constant.
def normal_prob_plot(e_i):
    """Expected residuals under normality (Blom plotting position)."""
    n = len(e_i); sse = np.sum(e_i**2)
    mse_sqrt = np.sqrt(sse/(n - 2.0))   # sqrt(MSE)
    from scipy import stats
    kr = stats.rankdata(e_i)            # rank k of each residual
    # expected value of the k-th ordered residual: sqrt(MSE)*z((k-0.375)/(n+0.25))
    e_ik = mse_sqrt*stats.norm.ppf((kr - 0.375)/(n + 0.25))
    return e_ik
def corrX(Y1, Y2):
    """Pearson correlation coefficient between Y1 and Y2."""
    barY1 = np.mean(Y1); barY2 = np.mean(Y2)
    Y1minusbarY1 = Y1 - barY1; Y2minusbarY2 = Y2 - barY2
    r12 = np.sum(Y1minusbarY1*Y2minusbarY2)/np.sqrt(np.sum(Y1minusbarY1**2)*np.sum(Y2minusbarY2**2))
    return r12
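A quick check (an addition): `corrX` computes the Pearson correlation, so it should agree with NumPy's built-in `np.corrcoef` on any toy input.
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([1.1, 1.9, 3.2, 3.8])
print(corrX(a, b), np.corrcoef(a, b)[0, 1])  # the two values should match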
e_i=linreg['e_i']
e_ik=normal_prob_plot(e_i)
plt.plot(e_ik,e_i,'ko',ms=10)
plt.xlim(-22,22);
plt.ylim(-30,30);
linreg1=lin_reg(e_ik,e_i)
Yhat_e=linreg1['Yhat'];
plt.plot(e_ik,Yhat_e,'r-',lw=1)
plt.title('Normal probability plot',fontsize=15)
plt.xlabel('Expected',fontsize=15);
plt.ylabel('Residual',fontsize=15);
r12 = corrX(e_i, e_ik)
display(Math(r'\rho=' + format(r12, '.3f')))
$H_0:$ Normal
$H_a:$ Not Normal
If $\rho \geq \rho_c$, conclude $H_0$; else conclude $H_a$.
$\rho_c$ is the critical value for the coefficient of correlation between the ordered residuals and their expected values under normality when the distribution of the error terms is normal (Table B.6 in the textbook).
$\rho_c=0.974$ for $n=45$ at $\alpha=0.05$.
We have $\rho \gt \rho_c$; therefore we conclude $H_0$, i.e., the error terms are approximately normally distributed.
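The decision can also be automated (a small addition; `rho_c` is hard-coded from Table B.6):
rho_c = 0.974  # critical value from Table B.6, n = 45, alpha = 0.05
print('conclude H0: normal' if r12 >= rho_c else 'conclude Ha: not normal')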
Finally, we perform the F test for lack of fit. Alternatives:
$H_0: E\{Y\}=\beta_0+\beta_1X$
$H_a: E\{Y\} \neq \beta_0+\beta_1X$
Test statistic:
$F^*=\frac{MSLF}{MSPE}$
If $F^* \leq F(1-\alpha; c-2,n-c)$, conclude $H_0$
If $F^* \gt F(1-\alpha; c-2,n-c)$, conclude $H_a$
$p$-value $= P\{F(c-2,n-c) \gt F^*\}$
import pandas as pd
# group the replicate Y observations at each distinct X level
X1 = np.unique(X)
Yij = []
for i in range(0, len(X1)):
    yij = np.int32(Y[X == X1[i]])   # replicates at level X1[i]
    Yij.append(yij)
barYj = []
for i in range(0, len(X1)):
    barYj.append(np.mean(Yij[i]))   # level mean
df = pd.DataFrame({'$j$': np.arange(1, len(X1)+1), '$X_j$': X1[:], '$Y_{ij}$': Yij[:], r'$\overline{Y_j}$': barYj[:]})
df.set_index('$j$')
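The table lists, for each distinct level $X_j$, the replicate observations $Y_{ij}$ and their level mean $\overline{Y_j}$; these replicates supply the pure-error sum of squares computed next.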
# calculate SSPE and MSPE (pure error)
sspe = 0.0
for j in range(0, len(X1)):
    sspe = sspe + np.sum((Yij[j] - barYj[j])**2)   # within-level squared deviations
n = len(X); c = len(X1)
msspe = sspe/(n - c)   # MSPE = SSPE/(n-c)
# calculate SSLF and MSLF (lack of fit)
sse = lin_reg(X, Y)['sse']
sslf = sse - sspe            # SSLF = SSE - SSPE
mslf = sslf/(c - 2.0)        # MSLF = SSLF/(c-2)
alpha = 0.05
Fstar = mslf/msspe           # test statistic F*
F = stats.f.ppf(1 - alpha, c - 2, n - c)   # critical value F(0.95; c-2, n-c)
# fancy display of relationships
Fstar0 = format(Fstar, '.3f'); F0 = format(F, '.3f');
display(Math(r'F^*='+str(Fstar0)))
display(Math(r'F='+str(F0)))
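The p-value defined above is not computed in the original cell; it can be obtained directly from the F distribution (a small addition, using `stats.f.cdf`):
# p-value = P{F(c-2, n-c) > F*}
pvalF = 1.0 - stats.f.cdf(Fstar, c - 2, n - c)
display(Math(r'p-value=' + format(pvalF, '.3f')))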
We observe $F^* \lt F$; therefore we conclude $H_0$, that is, the linear regression function is adequate (there is no evidence of lack of fit).