# read the data
from read_data_fx import read_tb_data
data=read_tb_data(1,28)
# Percentage of individuals with high school diploma is X
X=data[:,1]
# Crime rate is Y
Y=data[:,0];
# import the packages required
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Fit a line to the data
barX=np.mean(X); barY=np.mean(Y)
XminusbarX=X-barX; YminusbarY=Y-barY
b1=sum(XminusbarX*YminusbarY)/sum(XminusbarX**2)
b0=barY-b1*barX
Yhat=b0+b1*X
e_i=Y-Yhat # Residuals
print 'b0=%4.4f' %b0,'b1=%4.4f' %b1
# Plot the line and crosscheck all the properties
sum_of_residuals = np.sum(e_i)
sum_of_squares_of_residuals = np.sum(e_i**2)
print 'sum of residuals = %4.4f' % sum_of_residuals, ' ---property 1'
print 'sum of squares of residuals %4.4f' %sum_of_squares_of_residuals, ' ---property 2'
print 'sum of Y_i %4.4f' % np.sum(Y), ' ---property 3'
print 'sum of Yhat %4.4f' % np.sum(Yhat), ' ---property 3'
print 'sum of X_ie_i %4.4f' % np.sum(X*e_i), ' ---property 4'
print 'sum of Yhat e_i %4.4f' % np.sum(Yhat*e_i), ' ---property 5'
#make matplotlib inline
%matplotlib inline
plt.figure(figsize=(20,10))
# scatter plot
plt.scatter(X,Y,c='k',s=220,marker='d')
plt.xlabel('Percentage of individuals with high school diploma',fontsize=25)
# ylabel of the scatter plot
plt.ylabel('Crime rate',fontsize=25)
# title of the scatter plot
plt.title('Education Vs. Crime',fontsize=25)
# regression line
# point barX and barY
plt.plot(barX,barY,marker='*',markersize=70,color='b')
#regression line
plt.plot(X,Yhat,'g-',linewidth=3)
print 'Blue point represents the point xbar and ybar -- property 6'
To test whether or not there is a linear relationship we will use the following decision rule.
$H_0: \beta_1=0$ (No linear relationship)
$H_a; \beta_1 \neq 0$ (Linear relationship)
If $|t^*| \leq t(1-\alpha/2;n-2)$ then conclude $H_0$
If $|t^*| \gt t(1-\alpha/2;n-2)$ then conclude $H_a$
where $t^*=\frac{b_1}{s\{b_1\}}$
# We have already calculated b1, but we need to calculate sb1
n=len(X)
MSE=sum_of_squares_of_residuals/(n-2)
ssb1=np.sqrt(MSE/sum(XminusbarX**2))
# Packages for importing t-statistics
from scipy import stats
# calculating t value; we are given \alpha =0.01
t1=stats.t.ppf(0.995,n-2)
tstar=b1/ssb1
print '|tstar|= %4.4f'% abs(tstar) ,'and', 't=%4.4f' % t1
We observe that $|t^*| \gt t(1-\alpha/2;n-2)$, therefore we conclude $H_a$
Two sided p-value is given by $2P\{t(n-2) \gt t^*\}$
pval=1.0-stats.t.cdf(abs(tstar),n-2); pval=2.0*pval; print 'Two sided p-value = %4.4f' % pval
p-value $<\alpha$ therefore we can directly conclude $H_a$ i.e., there is a linear relationship between crime rate and education level
$1-\alpha$ confidence interval for $\beta_1$ is given by:
$b_1 \pm t(1-\alpha/2;n-2)s\{b_1\}$.
Where $t(1-\alpha/2;n-2)$ denotes $(1-\alpha/2)100$ percentile of the $t$ distribution. We will set $\alpha=0.01$ i.e., we will attempt to find $99\%$ confidence interval.
tssb=t1*ssb1
print 'confidence interval for beta_1 is %4.4f' % b1, '+/-', '%4.4f' % tssb ,'i.e.,'
llb0=b1-tssb; upl0=b1+tssb;lra=tssb/b1
print '%4.4f <= beta_1'% llb0, '<= %4.4f' % upl0
Therefore, we can infer that $99\%$ confidence interval of $\beta_1$ to be:
$-280.2118 \leq \beta_1 \leq -60.9386$
X0=np.arange(5,30,5)
k=10
Y1=np.zeros(5*k)
X1=np.zeros(5*k)
labels=[];
l=0
for i in range(0,5):
for j in range(0,k):
epsilon_1=np.random.normal(0,1,1)
Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
X1[l]=X0[i]
l=l+1
%matplotlib inline
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
plt.scatter(X1,Y1,c='r',s=80)
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)
plt.xlim(3,28)
k=25; Y2=np.zeros((5,k))
epsilon_1=np.random.normal(0,1,k)
labels=range(5,30,5);
xd=[]
for i in range(0,5):
Y2[i,:]=0.2+0.95*X0[i]+epsilon_1[:]
xd.append(Y2[i,:].tolist())
plt.subplot(1,2,2)
plt.boxplot(xd,labels=labels,sym='ro',whis=1.5);
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)
plt.tight_layout();
Maximum likelihood estimate of the variance is given by
$\hat\sigma=\frac{\sum(Y_i-\hat{Y_i})^2}{n}$
We are given by $\hat{Y}_i=0.25+ 0.9X_i$, and $Y_i=\beta_0+\beta_1X_i+ \epsilon_i $. Where $\epsilon_i$ are independent and are distributed as $N(0,1)$. We are given $\beta_0=0.2$, $\beta_1=0.95$, $i=1,...,5$ and in these $5$ trials $X_i=\{5,10,15,20,25\}$. We are also given that for every trial $i$ there are $k=10$ observations of $Y_i$
b0=0.25;b1=0.9
Yhat=b0+b1*X1
l=0;k=10
for i in range(0,5):
for j in range(0,k):
epsilon_1=np.random.normal(0,1,1)
Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
X1[l]=X0[i]
l=l+1
e_i=Y1-Yhat
n=len(X1)
masigma=sum(e_i**2)/n
print masigma