Solutions for homework sheet 1

Problem No. 1

In [1]:
# Import all the packages 
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt 
# Force matplotlib to produce figures inline
%matplotlib inline
In [2]:
# Load the pottery data. The file 'pottery.txt' must already be present in the
# working directory. Columns 0, 4 and 5 are read into the fields 'Al', 'Na' and
# 'site', and the first line of the file is skipped.
# genfromtxt's old `skiprows` keyword is no longer accepted by NumPy;
# `skip_header` is the supported name for the same option.
pottery_data = np.genfromtxt(
    'pottery.txt',
    usecols=(0, 4, 5),
    dtype=[('Al', 'f8'), ('Na', 'f8'), ('site', 'S2')],
    skip_header=1,
)

# Find unique labels in site (np.unique returns them sorted)
labels = np.unique(pottery_data['site'])

# Create one array of measurements per unique site, in the order of `labels`
xdAl = []; xdNa = []
for label in labels:
    at_site = pottery_data['site'] == label   # boolean mask selecting this site's rows
    xdAl.append(pottery_data['Al'][at_site])
    xdNa.append(pottery_data['Na'][at_site])

# The 'S2' field holds byte strings; decode them so that the tick labels do not
# show up as b'..' on Python 3.
tick_text = [s.decode() if isinstance(s, bytes) else str(s) for s in labels]

# Plot the data for Al and Na oxides side by side, one box per site.
panels = (
    (xdAl, 'Al oxide concentration'),
    (xdNa, 'Na oxide concentration'),
)
for position, (samples, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(samples, sym='ro', whis=1.5)
    # boxplot draws the boxes at x = 1..N by default; label those ticks here
    # rather than through a boxplot keyword.
    plt.xticks(range(1, len(tick_text) + 1), tick_text)
    plt.title(title)
    plt.xlabel('Sites')
    plt.ylabel('Concentration (in %)')

plt.tight_layout();

Problem No. 2

In [3]:
# Fetch the "delivery" dataset of the R package "robustbase" through statsmodels
delivery_time = sm.datasets.get_rdataset("delivery", "robustbase")
deliveries = delivery_time.data

# Scatter delivery time against distance; the marker size is 15 times the
# 'n.prod' column of the same row.
plt.scatter(
    deliveries['distance'],
    deliveries['delTime'],
    c='c',
    s=15 * deliveries['n.prod'],
)
plt.title ('Time to Service')
plt.xlabel('Distance')
plt.ylabel('Delivery Time')
plt.xlim([0, 1600])
Out[3]:
(0, 1600)

Problem No. 3

Central Limit Theorem:

The central limit theorem states that the distribution of the sum (or mean) of a large number of independent, identically distributed random variables with finite variance is approximately normal, regardless of the underlying distribution.

Step 1. Generate uniformly distributed random numbers.
In [4]:
# Seed the global NumPy generator so that a fresh "Restart & Run All"
# reproduces exactly the same draws (and therefore the same figures).
np.random.seed(0)

# 10 rows x 11,555,000 columns of draws from the uniform distribution on [0, 1).
# As float64 this array occupies roughly 0.9 GB of memory.
ur = np.random.rand(10, 11555000)
Step 2. Plot their distribution.
In [5]:
# Histogram (75 bins) of the first row of ur, normalised so that the bar areas
# sum to one. `density=True` replaces the `normed` keyword, which current
# Matplotlib no longer accepts; the original also passed the string 'True'
# where a boolean was meant.
plt.hist(ur[0, :], 75, color='c', density=True);
Step 3. Calculate the mean of each of many sequences of uniformly distributed random variables, where every sequence consists of n random variables.
In [6]:
# Sequence lengths n for which the sample mean is computed.
sample_sizes = (2, 4, 8, 10)

# One row per sequence length, one column per column of ur. Each column of ur
# supplies one sequence, built from that column's first n rows.
Xmu1 = np.zeros((len(sample_sizes), ur.shape[1]))

for row, n in enumerate(sample_sizes):
    # Mean over the first n rows of ur, i.e. over n uniform draws per column.
    # For n = 10 this averages ur[0:10]; the earlier code sliced ur[0:8] there,
    # which merely duplicated the n = 8 row.
    Xmu1[row, :] = np.mean(ur[0:n], axis=0)
Step 4. Plot distributions for different n as subplots.
In [7]:
# (row of Xmu1, sequence length n shown in the panel title)
panels = ((0, 2), (1, 4), (2, 8), (3, 10))

# One histogram (75 bins, normalised to unit area) per sequence length on a
# 2x2 grid. The fourth panel reads row 3 of Xmu1, the row filled under the
# "# For n=10" step; it previously re-plotted row 2 under the title 'n = 10'.
# `density=True` replaces the `normed` keyword, which current Matplotlib no
# longer accepts.
for position, (row, n) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.hist(Xmu1[row, :], 75, color='k', density=True)
    plt.title('n = %d' % n)

plt.tight_layout()


plt.show()