# OLS intuition

# Fix the RNG seed so every run reproduces the same draws

set.seed(1234)

# Sample size for the first simulation

n1 <- 100

# True slope of the data-generating process

beta_1 <- 1

# Regressor: n1 normal draws with mean 1 and standard deviation 10

x1 <- rnorm(n = n1, mean = 1, sd = 10)

# Error term: n1 normal draws with mean 5 and standard deviation 1

epsilon1 <- rnorm(n = n1, mean = 5, sd = 1)

# Outcome generated from the linear model

y1 <- beta_1 * x1 + epsilon1

# Fit y1 on x1 (plus an intercept) by OLS using the built-in lm()

model1 <- lm(y1 ~ x1)

# Coefficient estimates, standard errors and fit statistics

summary(model1)

# Scatter plot of the sample with the fitted regression line drawn on top

plot(x1, y1)
abline(model1)

# Questions: What do you expect to happen if

# 1. We increase the variance of x1 or epsilon?
# 2. We choose mean(epsilon1) != 0 (as above, where it is 5) rather than 0?
# 3. We increase the sample size, holding everything else fixed?


# New model, including an additional variable

n2 <- 1000

# Two regressors, both drawn with mean 0 (standard deviations 3 and 1)

x1 <- rnorm(n2, mean = 0, sd = 3)

x2 <- rnorm(n2, mean = 0, sd = 1)


# define the true values of the coefficients

beta_1 <- 2

beta_2 <- 4

# The error term is normal noise (mean 0, sd 5) plus x2^2. Because x2 is
# drawn with mean 0, x2^2 is uncorrelated with x2 (Cov(x2, x2^2) = E[x2^3] = 0),
# so this extra term mainly shifts the intercept by about E[x2^2] = 1.

epsilon2 <- rnorm(n2, mean = 0, sd = 5) + x2^2

y2 <- beta_1 * x1 + beta_2 * x2 + epsilon2

# OLS fit of y2 on both regressors (plus an intercept)

model2 <- lm(y2 ~ x1 + x2)

summary(model2)

# Scatter plot of y2 against x1 with the fitted relationship drawn on top.
# model2 has three coefficients, so abline(model2) would use only the first
# two and emit a warning. Instead draw the fitted line in the (x1, y2) plane
# explicitly, holding x2 at its sample mean.

plot(x1, y2)
abline(
  a = coef(model2)[["(Intercept)"]] + coef(model2)[["x2"]] * mean(x2),
  b = coef(model2)[["x1"]]
)


# Same structure as the previous model, but now the error term is correlated
# with one of the regressors: it contains x2^2 and x2 is drawn with mean 2
# instead of 0. What will happen to the estimates?

n3 <- 1000

# True coefficients of the data-generating process

beta_1 <- 2

beta_2 <- 4

# Regressors, both centred at 2 (standard deviations 3 and 1)

x1 <- rnorm(n = n3, mean = 2, sd = 3)

x2 <- rnorm(n = n3, mean = 2, sd = 1)

# Error term: squared x2 plus normal noise with mean 1 and sd 1

epsilon3 <- x2^2 + rnorm(n = n3, mean = 1, sd = 1)

# Outcome generated from the linear model

y3 <- beta_1 * x1 + beta_2 * x2 + epsilon3

# OLS fit of y3 on both regressors (plus an intercept)

model3 <- lm(y3 ~ x1 + x2)

summary(model3)