# Solutions for homework sheet 1

### Problem No. 1¶

In [1]:
# Import all the packages
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Force matplotlib to produce figures inline
%matplotlib inline

In [2]:
# Read the pottery data from the local file 'pottery.txt'.
# Column 0 is stored as 'Al', column 4 as 'Na' and column 5 as the (2-character) 'site' label.
# The first line of the file is skipped; `skip_header` is the supported keyword for this
# (the old `skiprows` keyword is no longer accepted by np.genfromtxt).
pottery_data = np.genfromtxt(
    'pottery.txt',
    usecols=(0, 4, 5),
    dtype=[('Al', 'f8'), ('Na', 'f8'), ('site', 'S2')],
    skip_header=1,
)

# Find unique labels in site
labels = np.unique(pottery_data['site'])

# Create a list with data for each unique site (one array per site, in `labels` order)
xdAl = []; xdNa = []
for label in labels:
    at_site = pottery_data['site'] == label  # boolean mask selecting this site's rows
    xdAl.append(pottery_data['Al'][at_site])
    xdNa.append(pottery_data['Na'][at_site])

# 'S2' fields are byte strings on Python 3; decode them so the tick text reads
# "Ll" rather than "b'Ll'".
site_names = [lab.decode('ascii') if isinstance(lab, bytes) else str(lab) for lab in labels]

# Plot the data for Al and Na oxides side by side
fig, (ax_al, ax_na) = plt.subplots(1, 2)
panels = (
    (ax_al, xdAl, 'Al oxide concentration'),
    (ax_na, xdNa, 'Na oxide concentration'),
)
for ax, samples, title in panels:
    ax.boxplot(samples, sym='ro', whis=1.5)
    # Tick labels are set explicitly instead of through boxplot's label keyword,
    # whose name differs between matplotlib versions.
    ax.set_xticklabels(site_names)
    ax.set_title(title)
    ax.set_xlabel('Sites')
    ax.set_ylabel('Concentration (in %)')

fig.tight_layout();


### Problem No. 2¶

In [3]:
# Fetch the "delivery" dataset of the R package "robustbase" through statsmodels
delivery_time = sm.datasets.get_rdataset("delivery", "robustbase")
deliveries = delivery_time.data

# Scatter of delivery time against distance; marker area is 15 times the 'n.prod' column
marker_area = 15 * deliveries['n.prod']
plt.scatter(deliveries['distance'], deliveries['delTime'], c='c', s=marker_area)
plt.title('Time to Service')
plt.xlabel('Distance')
plt.ylabel('Delivery Time')
# Kept as the last expression so the cell still echoes the axis limits
plt.xlim([0, 1600])

Out[3]:
(0, 1600)

### Problem No. 3¶

#### Central Limit Theorem:¶

The central limit theorem states that the distribution of the sum (or mean) of a large number of independent, identically distributed random variables with finite variance is approximately normal, regardless of the shape of the underlying distribution.

###### Step 1. Generate uniformly distributed random numbers.
In [4]:
# Seed the global NumPy generator so that Restart & Run All reproduces the same figures.
np.random.seed(0)
# 10 rows of uniform [0, 1) samples; later cells average the first n rows column-wise.
ur = np.random.rand(10, 11555000)

###### Step 2. Plot their distribution.
In [5]:
# Histogram (75 bins) of the first row of `ur`, normalised to unit area.
# `density=True` replaces the removed `normed` keyword (which was also being passed
# the string 'True' rather than a boolean).
plt.hist(ur[0, :], 75, color='c', density=True);

###### Step 3. For each sequence length n, calculate the mean of n uniformly distributed random variables; doing this for every column gives many independent realizations of the sample mean of a sequence of length n.
In [6]:
# Row k of Xmu1 holds, for every column of `ur`, the mean of its first n rows,
# with n taken from `sample_sizes` in order (n = 2, 4, 8, 10).
sample_sizes = (2, 4, 8, 10)

# Size the result from `ur` itself instead of repeating its column count literally.
Xmu1 = np.zeros((len(sample_sizes), ur.shape[1]))

for row, n in enumerate(sample_sizes):
    # Average over axis 0, i.e. across the first n rows of each column.
    # The n = 10 case previously averaged ur[0:8], duplicating the n = 8 row.
    Xmu1[row, :] = np.mean(ur[0:n], axis=0)

###### Step 4. Plot distributions for different n as subplots.¶
In [7]:
# One histogram panel per row of Xmu1, titled with the matching sample size.
# The original drew row 2 twice and labelled the second copy 'n = 10'; looping over
# (row, n) pairs keeps each panel tied to its own row.
fig, axes = plt.subplots(2, 2)
for row, (ax, n) in enumerate(zip(axes.flat, (2, 4, 8, 10))):
    # density=True normalises each histogram to unit area; it replaces the
    # removed `normed` keyword.
    ax.hist(Xmu1[row, :], 75, color='k', density=True)
    ax.set_title('n = %d' % n)

plt.tight_layout()

plt.show()