# Solutions for mid-term 1¶

### Problem 1(a)¶

In [1]:
# read the data
# Percentage of individuals with high school diploma is X
X=data[:,1]
# Crime rate is Y
Y=data[:,0];

In [2]:
# import the packages required
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [3]:
# Fit a line to the data
barX=np.mean(X); barY=np.mean(Y)
XminusbarX=X-barX; YminusbarY=Y-barY
b1=sum(XminusbarX*YminusbarY)/sum(XminusbarX**2)
b0=barY-b1*barX
Yhat=b0+b1*X
e_i=Y-Yhat # Residuals
print 'b0=%4.4f' %b0,'b1=%4.4f' %b1

b0=20517.5999 b1=-170.5752

In [4]:
# Plot the line and crosscheck all the properties

sum_of_residuals = np.sum(e_i)
sum_of_squares_of_residuals = np.sum(e_i**2)
print 'sum of residuals = %4.4f' % sum_of_residuals, ' ---property 1'
print 'sum of squares of residuals %4.4f' %sum_of_squares_of_residuals, '  ---property 2'
print 'sum of Y_i %4.4f' % np.sum(Y), '  ---property 3'
print 'sum of Yhat %4.4f' % np.sum(Yhat), '  ---property 3'
print 'sum of X_ie_i %4.4f' % np.sum(X*e_i), '  ---property 4'
print 'sum of Yhat e_i %4.4f' % np.sum(Yhat*e_i), '   ---property 5'

#make matplotlib inline
%matplotlib inline
plt.figure(figsize=(20,10))
# scatter plot
plt.scatter(X,Y,c='k',s=220,marker='d')
plt.xlabel('Percentage of individuals with high school diploma',fontsize=25)
# ylabel of the scatter plot
plt.ylabel('Crime rate',fontsize=25)
# title of the scatter plot
plt.title('Education Vs. Crime',fontsize=25)
# regression line
# point barX and barY
plt.plot(barX,barY,marker='*',markersize=70,color='b')
#regression line
plt.plot(X,Yhat,'g-',linewidth=3)
print 'Blue point represents the point xbar and ybar -- property 6'

sum of residuals = -0.0000  ---property 1
sum of squares of residuals 455273165.2925   ---property 2
sum of Y_i 597341.0000   ---property 3
sum of Yhat 597341.0000   ---property 3
sum of X_ie_i -0.0000   ---property 4
sum of Yhat e_i -0.0000    ---property 5
Blue point represents the point xbar and ybar -- property 6


### Test for Linear relationship¶

To test whether or not there is a linear relationship we will use the following decision rule.

$H_0: \beta_1=0$ (No linear relationship)

$H_a; \beta_1 \neq 0$ (Linear relationship)

If $|t^*| \leq t(1-\alpha/2;n-2)$ then conclude $H_0$

If $|t^*| \gt t(1-\alpha/2;n-2)$ then conclude $H_a$

where $t^*=\frac{b_1}{s\{b_1\}}$

In [5]:
# We have already calculated b1, but we need to calculate sb1
n=len(X)
MSE=sum_of_squares_of_residuals/(n-2)
ssb1=np.sqrt(MSE/sum(XminusbarX**2))

In [6]:
# Packages for importing t-statistics
from scipy import stats
# calculating t value; we are given \alpha =0.01
t1=stats.t.ppf(0.995,n-2)
tstar=b1/ssb1
print '|tstar|= %4.4f'% abs(tstar) ,'and', 't=%4.4f' % t1

|tstar|= 4.1029 and t=2.6371


We observe that $|t^*| \gt t(1-\alpha/2;n-2)$, therefore we conclude $H_a$

### Two sided p-value¶

Two sided p-value is given by $2P\{t(n-2) \gt t^*\}$

In [7]:
pval=1.0-stats.t.cdf(abs(tstar),n-2); pval=2.0*pval; print 'Two sided p-value = %4.4f' % pval

Two sided p-value = 0.0001


p-value $<\alpha$ therefore we can directly conclude $H_a$ i.e., there is a linear relationship between crime rate and education level

### Problem 1(b)¶

$1-\alpha$ confidence interval for $\beta_1$ is given by:

$b_1 \pm t(1-\alpha/2;n-2)s\{b_1\}$.

Where $t(1-\alpha/2;n-2)$ denotes $(1-\alpha/2)100$ percentile of the $t$ distribution. We will set $\alpha=0.01$ i.e., we will attempt to find $99\%$ confidence interval.

In [8]:
tssb=t1*ssb1

print 'confidence interval for beta_1 is %4.4f' % b1, '+/-', '%4.4f' % tssb ,'i.e.,'
llb0=b1-tssb; upl0=b1+tssb;lra=tssb/b1

print '%4.4f <= beta_1'% llb0,  '<= %4.4f' % upl0

confidence interval for beta_1 is -170.5752 +/- 109.6366 i.e.,
-280.2118 <= beta_1 <= -60.9386


Therefore, we can infer that $99\%$ confidence interval of $\beta_1$ to be:

$-280.2118 \leq \beta_1 \leq -60.9386$

### Problem 2(a)¶

In [9]:
X0=np.arange(5,30,5)
k=10
Y1=np.zeros(5*k)
X1=np.zeros(5*k)

labels=[];

l=0
for i in range(0,5):
for j in range(0,k):
epsilon_1=np.random.normal(0,1,1)
Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
X1[l]=X0[i]
l=l+1

%matplotlib inline

plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
plt.scatter(X1,Y1,c='r',s=80)
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)
plt.xlim(3,28)

k=25; Y2=np.zeros((5,k))
epsilon_1=np.random.normal(0,1,k)
labels=range(5,30,5);

xd=[]
for i in range(0,5):

Y2[i,:]=0.2+0.95*X0[i]+epsilon_1[:]

xd.append(Y2[i,:].tolist())

plt.subplot(1,2,2)
plt.boxplot(xd,labels=labels,sym='ro',whis=1.5);
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)

plt.tight_layout();


### Problem 2 (b)¶

Maximum likelihood estimate of the variance is given by

$\hat\sigma=\frac{\sum(Y_i-\hat{Y_i})^2}{n}$

We are given by $\hat{Y}_i=0.25+ 0.9X_i$, and $Y_i=\beta_0+\beta_1X_i+ \epsilon_i$. Where $\epsilon_i$ are independent and are distributed as $N(0,1)$. We are given $\beta_0=0.2$, $\beta_1=0.95$, $i=1,...,5$ and in these $5$ trials $X_i=\{5,10,15,20,25\}$. We are also given that for every trial $i$ there are $k=10$ observations of $Y_i$

In [10]:
b0=0.25;b1=0.9
Yhat=b0+b1*X1

l=0;k=10
for i in range(0,5):
for j in range(0,k):
epsilon_1=np.random.normal(0,1,1)
Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
X1[l]=X0[i]
l=l+1

e_i=Y1-Yhat
n=len(X1)

masigma=sum(e_i**2)/n
print masigma

1.4906964476