piyush sachdeva - homework 5

Upload: lucas-triana

Post on 01-Jun-2018

224 views

Category:

Documents


0 download

TRANSCRIPT

  • 8/8/2019 piyush sachdeva - Homework 5

    1/48

    Lucas Triana

    Part 1

    Homework 5

    # Load the bond trade data (comma-separated file with a header row)
    bonddata <- read.table("http://www.stat.cmu.edu/~cschafer/MSCF/bonddata.txt",
                           sep = ",", header = TRUE)

    # Build the working data frame from the modelling columns.
    # NOTE(review): attach() is kept because later statements may refer to these
    # columns by bare name; prefer bonddata$col or with() once that is confirmed.
    attach(bonddata)
    newdata <- as.data.frame(cbind(
      weight, current_coupon, time_to_maturity,
      reporting_delay, trade_size, curve_based_price,
      received_time_diff_last1, trade_price_last1, trade_size_last1,
      curve_based_price_last1, is_callable, trade_type, trade_type_last1,
      trade_price
    ))

    # Remove influential observation #1457
    newdata <- as.data.frame(newdata[-1457, ])

    # Initial histograms of weight, time_to_maturity, trade_size, trade_size_last1
    hist(newdata$weight, main = "Weight")

    hist(newdata$time_to_maturity, main = "Time to maturity")

  • 8/8/2019 piyush sachdeva - Homework 5

    2/48

     

    # Histogram of newdata$trade_size on its raw scale
    hist(newdata$trade_size, main="Trade size")

    # Histogram of newdata$trade_size_last1 on its raw scale
    hist(newdata$trade_size_last1, main="Trade size last 1")

  • 8/8/2019 piyush sachdeva - Homework 5

    3/48

     

    # Log transformation of the previous variables (to spread the data more evenly)
    log_weight <- log(newdata$weight)
    hist(log_weight, main = "log Weight")

  • 8/8/2019 piyush sachdeva - Homework 5

    4/48

  • 8/8/2019 piyush sachdeva - Homework 5

    5/48

     

    log_trade_size_last1 <- log(newdata$trade_size_last1)
    hist(log_trade_size_last1, main = "log Trade size last 1")

    # Transform reporting_delay and received_time_diff_last1 into categorical
    # variables by binning them at the given break points (four bins each)
    categorical_reporting_delay <- as.factor(
      cut(newdata$reporting_delay, c(-Inf, 2, 10, 100, Inf))
    )
    categorical_received_time_diff_last1 <- as.factor(
      cut(newdata$received_time_diff_last1, c(-Inf, 500, 75000, 4000000, Inf))
    )

    # Fitting GAM from mgcv
    library(mgcv)

    ## Loading required package: nlme
    ## This is mgcv 1.8-3. For overview type 'help("mgcv-package")'.

  • 8/8/2019 piyush sachdeva - Homework 5

    6/48

     

    # Assembling the data frame with the transformed variables.
    # attach() puts newdata's columns on the search path; the cbind() call that
    # builds transformeddata refers to several of them by bare name.
    attach(newdata)

    ## The following objects are masked from bonddata:
    ##
    ##     current_coupon, curve_based_price, curve_based_price_last1,
    ##     is_callable, received_time_diff_last1, reporting_delay,
    ##     time_to_maturity, trade_price, trade_price_last1, trade_size,
    ##     trade_size_last1, trade_type, trade_type_last1, weight

    transformeddata <- as.data.frame(cbind(
      log_weight, current_coupon, log_time_to_maturity,
      categorical_reporting_delay, log_trade_size, curve_based_price,
      categorical_received_time_diff_last1, trade_price_last1,
      log_trade_size_last1, curve_based_price_last1, is_callable,
      trade_type, trade_type_last1, trade_price
    ))

    # cbind() produces a numeric matrix, so every column of the data frame is
    # treated as numeric (factors are reduced to their integer codes). Declare
    # the categorical variables as factors again before modelling.
    factor_columns <- c(
      "categorical_reporting_delay",
      "categorical_received_time_diff_last1",
      "is_callable", "trade_type", "trade_type_last1"
    )
    transformeddata[factor_columns] <- lapply(transformeddata[factor_columns], factor)

  • 8/8/2019 piyush sachdeva - Homework 5

    7/48

    # Fit the GAM: smooth terms s() for the continuous predictors, plain
    # parametric terms for the factor predictors
    holdgam <- gam(
      trade_price ~ s(log_weight) + s(current_coupon) + s(log_time_to_maturity) +
        categorical_reporting_delay + s(log_trade_size) + s(curve_based_price) +
        categorical_received_time_diff_last1 + s(trade_price_last1) +
        s(log_trade_size_last1) + s(curve_based_price_last1) + is_callable +
        trade_type + trade_type_last1,
      data = transformeddata
    )

    summary(holdgam)

    #### Family: gaussian## Link function: identity#### Formula:## trade_price ~ s(log_weight) + s(current_coupon) + s(log_time_to_maturity) +## categorical_reporting_delay + s(log_trade_size) + s(curve_based_price) +## categorical_received_time_diff_last1 + s(trade_price_last1) +## s(log_trade_size_last1) + s(curve_based_price_last1) + is_callable +## trade_type + trade_type_last1

    #### Parametric coefficients:## Estimate Std. Error t value## (Intercept) 105.56558 0.22310 473.181## categorical_reporting_delay2 -0.23967 0.09514 -2.519## categorical_reporting_delay3 -0.29058 0.09462 -3.071## categorical_reporting_delay4 -0.42087 0.12867 -3.271## categorical_received_time_diff_last12 -0.33503 0.20980 -1.597## categorical_received_time_diff_last13 -0.55544 0.27039 -2.054## categorical_received_time_diff_last14 -1.47683 0.45408 -3.252## is_callable1 -0.18380 0.12913 -1.423## trade_type3 1.55252 0.09667 16.060## trade_type4 0.73781 0.09016 8.184

    ## trade_type_last13 -0.94159 0.09679 -9.728## trade_type_last14 -0.42791 0.09327 -4.588## Pr(>|t|)## (Intercept) < 2e-16 ***## categorical_reporting_delay2 0.01187 *## categorical_reporting_delay3 0.00217 **## categorical_reporting_delay4 0.00110 **## categorical_received_time_diff_last12 0.11048## categorical_received_time_diff_last13 0.04012 *## categorical_received_time_diff_last14 0.00117 **## is_callable1 0.15483## trade_type3 < 2e-16 ***## trade_type4 5.63e-16 ***

    ## trade_type_last13 < 2e-16 ***## trade_type_last14 4.83e-06 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Approximate significance of smooth terms:## edf Ref.df F p-value## s(log_weight) 1.000 1.000 4.562 0.03284 *## s(current_coupon) 2.435 3.108 2.403 0.06374 .

  • 8/8/2019 piyush sachdeva - Homework 5

    8/48

    ## s(log_time_to_maturity) 5.231 6.444 1.751 0.09992 .## s(log_trade_size) 1.681 2.115 12.755 2.28e-06 ***## s(curve_based_price) 8.510 8.936 19.838 < 2e-16 ***## s(trade_price_last1) 4.846 6.217 278.604 < 2e-16 ***## s(log_trade_size_last1) 4.744 5.758 3.180 0.00497 **## s(curve_based_price_last1) 8.948 8.994 6.757 1.47e-09 ***

    ## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### R-sq.(adj) = 0.985 Deviance explained = 98.5%## GCV = 2.0643 Scale est. = 2.0014 n = 1619

    # Difference in fitted trade_price between trade type 4 and trade type 3,
    # all other predictors held fixed. The coefficients are looked up by name
    # (positions 10 and 9 in the summary above) so the result does not depend on
    # the order of the model terms.
    type3vs4pricedif <- coef(holdgam)["trade_type4"] - coef(holdgam)["trade_type3"]

    # Plot of fitted relationships between continuous predictors and the response
    plot(holdgam, pages = 4, scale = 0, scheme = 1)

  • 8/8/2019 piyush sachdeva - Homework 5

    9/48

     

  • 8/8/2019 piyush sachdeva - Homework 5

    10/48

     

    # Residuals vs fitted values
    # Close the current graphics device so the next plot starts on a fresh one;
    # guarded because dev.off() fails when only the null device (1) is open.
    if (dev.cur() > 1) dev.off()

  • 8/8/2019 piyush sachdeva - Homework 5

    11/48

    ## null device
    ##           1

    plot(as.numeric(holdgam$fitted.values), as.numeric(holdgam$residuals),
         pch = 16, xlab = "Fitted Values", ylab = "Residuals",
         main = "Fitted values vs. Residuals", cex.axis = 1.3, cex.lab = 1.3)
    # Dashed reference line at zero residual
    abline(h = 0, lwd = 2, col = 4, lty = 2)
    print("Analyzing the fitted values vs. the residuals one can argue that there is no strong evidence of uneven variance across the data. Also it seems reasonable to assume that the data are spread around zero.")

    ## [1] "Analyzing the fitted values vs. the residuals one can argue that there is no strong evidence of uneven variance across the data. Also it seems reasonable to assume that the data are spread around zero."

    # Actual response vs fitted values
    plot(as.numeric(holdgam$fitted.values), as.numeric(transformeddata$trade_price),
         pch = 16, xlab = "Fitted Values", ylab = "response",
         main = "Fitted values vs. response", cex.axis = 1.3, cex.lab = 1.3)
    # Line of perfect agreement (response == fitted value)
    abline(a = 0, b = 1, lwd = 2, col = 4, lty = 1)

    print("Analyzing the fitted values vs. the response one can argue that the model is reasonably good in its predicting power given that the data are scattered around the perfect agreement line here plotted. This introduces some worries as the prediction is too good which might indicate possible overffiting or spurious relationships between the regressors and the response.")

    ## [1] "Analyzing the fitted values vs. the response one can argue that the model is reasonably good in its predicting power given that the data are scattered around the perfect agreement line here plotted. This introduces some worries as the prediction is too good which might indicate possible overffiting or spurious relationships between the regressors and the response."

    # QQ plot of the GAM residuals against the normal distribution
    qqnorm(as.numeric(holdgam$residuals), cex.axis = 1.3, cex.lab = 1.3, pch = 16, main = "QQ PLot")
    qqline(as.numeric(holdgam$residuals))
    print("There is clear evidence of heavy tails on both the lower and upper end of the plot, perhaps modifying the normality assumption would be advisable.")

    ## [1] "There is clear evidence of heavy tails on both the lower and upper end of theplot, perhaps modifying the normality assumption would be advisable."

    # Fitting the linear model: same predictors as the GAM but without smooth
    # terms, fitted through gam() so both models expose the same fields
    holdlinear <- gam(
      trade_price ~ log_weight + current_coupon + log_time_to_maturity +
        categorical_reporting_delay + log_trade_size + curve_based_price +
        categorical_received_time_diff_last1 + trade_price_last1 +
        log_trade_size_last1 + curve_based_price_last1 + is_callable +
        trade_type + trade_type_last1,
      data = transformeddata
    )

    summary(holdlinear)

    #### Family: gaussian## Link function: identity#### Formula:## trade_price ~ log_weight + current_coupon + log_time_to_maturity +

  • 8/8/2019 piyush sachdeva - Homework 5

    12/48

    ## categorical_reporting_delay + log_trade_size + curve_based_price +## categorical_received_time_diff_last1 + trade_price_last1 +## log_trade_size_last1 + curve_based_price_last1 + is_callable +## trade_type + trade_type_last1#### Parametric coefficients:

    ## Estimate Std. Error t value Pr(>|t|)## (Intercept) 1.24374 0.50380 2.469 0.013664## log_weight 0.12614 0.04974 2.536 0.011304## current_coupon 0.03085 0.02633 1.172 0.241572## log_time_to_maturity 0.03324 0.03924 0.847 0.397070## categorical_reporting_delay2 -0.18564 0.09816 -1.891 0.058791## categorical_reporting_delay3 -0.26398 0.09763 -2.704 0.006924## categorical_reporting_delay4 -0.40421 0.13214 -3.059 0.002258## log_trade_size 0.11996 0.02313 5.187 2.41e-07## curve_based_price 0.42527 0.03433 12.388 < 2e-16## categorical_received_time_diff_last12 -0.35606 0.21590 -1.649 0.099316## categorical_received_time_diff_last13 -0.63120 0.27888 -2.263 0.023748## categorical_received_time_diff_last14 -1.61833 0.46655 -3.469 0.000537## trade_price_last1 0.69842 0.01683 41.500 < 2e-16## log_trade_size_last1 -0.08395 0.02273 -3.693 0.000229## curve_based_price_last1 -0.13939 0.03589 -3.883 0.000107## is_callable1 -0.03317 0.11815 -0.281 0.778937## trade_type3 1.62292 0.09957 16.299 < 2e-16## trade_type4 0.81764 0.09287 8.804 < 2e-16## trade_type_last13 -0.89904 0.09972 -9.016 < 2e-16## trade_type_last14 -0.37862 0.09602 -3.943 8.39e-05#### (Intercept) *## log_weight *## current_coupon## log_time_to_maturity## categorical_reporting_delay2 .## categorical_reporting_delay3 **## categorical_reporting_delay4 **## log_trade_size ***## curve_based_price ***## categorical_received_time_diff_last12 .## categorical_received_time_diff_last13 *## categorical_received_time_diff_last14 ***## trade_price_last1 ***## log_trade_size_last1 ***## curve_based_price_last1 ***## is_callable1## trade_type3 ***## trade_type4 ***## trade_type_last13 ***## trade_type_last14 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1###### R-sq.(adj) = 0.984 Deviance explained = 98.4%## GCV = 2.2 Scale est. = 2.1728 n = 1619

  • 8/8/2019 piyush sachdeva - Homework 5

    13/48

    # Residuals vs fitted values for the linear model
    plot(as.numeric(holdlinear$fitted.values), as.numeric(holdlinear$residuals),
         pch = 16, xlab = "Fitted Values", ylab = "Residuals",
         main = "Fitted values vs. Residuals", cex.axis = 1.3, cex.lab = 1.3)
    abline(h = 0, lwd = 2, col = 3, lty = 2)
    print("Analyzing the fitted values vs. the residuals one can argue that there is no strong evidence of uneven variance across the data. Also it seems reasonable to assume that the data are spread around zero.")
    ## [1] "Analyzing the fitted values vs. the residuals one can argue that there is no strong evidence of uneven variance across the data. Also it seems reasonable to assume that the data are spread around zero."

    # Actual response vs fitted values.
    # Uses holdlinear: the original plotted holdgam's fitted values here, so this
    # section repeated the GAM diagnostics instead of showing the linear model.
    plot(as.numeric(holdlinear$fitted.values), as.numeric(transformeddata$trade_price),
         pch = 16, xlab = "Fitted Values", ylab = "response",
         main = "Fitted values vs. response", cex.axis = 1.3, cex.lab = 1.3)
    abline(a = 0, b = 1, lwd = 2, col = 3, lty = 1)
    print("Analyzing the fitted values vs. the response one can argue that the model is reasonably good in its predicting power given that the data are scattered around the perfect agreement line here plotted. This introduces some worries as the prediction is too good which might indicate possible overffiting or spurious relationships between the regressors and the response.")

    ## [1] "Analyzing the fitted values vs. the response one can argue that the model is reasonably good in its predicting power given that the data are scattered around the perfect agreement line here plotted. This introduces some worries as the prediction is too good which might indicate possible overffiting or spurious relationships between the regressors and the response."

    # QQ plot of the linear model's residuals (the original used holdgam's)
    qqnorm(as.numeric(holdlinear$residuals), cex.axis = 1.3, cex.lab = 1.3, pch = 4, main = "QQ PLot")
    qqline(as.numeric(holdlinear$residuals))
    print("There is clear evidence of heavy tails on both the lower and upper end of the plot, perhaps modifying the normality assumption would be advisable.")
    ## [1] "There is clear evidence of heavy tails on both the lower and upper end of the plot, perhaps modifying the normality assumption would be advisable."

    # AIC comparison: the aic field stored on each fitted gam object
    holdgam$aic

    ## [1] 5768.452

    holdlinear$aic

    ## [1] 5872.798

    print("Analyzing diagnostic plots for both models and their AIC values (which are distribution dependant and given that the qq plots are showing little reliability on the normality assumption), one could argue that there is no justification in adding more complexity in the model given that the simple linear relationships seem to explain as much variability as in the complex model.")

    ## [1] "Analyzing diagnostic plots for both models and their AIC values (which are distribution dependent and given that the qq plots are showing little reliability on the normality assumption), one could argue that there is no justification in adding mor

  • 8/8/2019 piyush sachdeva - Homework 5

    14/48

    e complexity in the model given that the simple linear relationships seem to explainas much variability as in the complex model."

  • 8/8/2019 piyush sachdeva - Homework 5

    15/48

    Lucas Triana

    Part 2

    Homework 5

    # Read the list of market symbols (the URL must stay on one line: a line
    # break inside the string literal becomes part of the address)
    market_symbols <- read.table("http://www.stat.cmu.edu/~cschafer/MSCF/Project/ChallengeSymbols2015.txt")

    # Including the following predictors
    # ^GSPC  S&P500
    # ^VIX   CBOE Volatility Index
    # ^VXN   CBOE NASDAQ Volatility Index
    # ^VXO   CBOE S&P 100 Volatility Index
    my_symbols <- read.table("mySymbols.txt")

    library(quantmod)

    # Keep only the first column (V1) of each table as a plain vector
    market_symbols <- as.vector(market_symbols$V1)

    my_symbols <- as.vector(my_symbols$V1)

    #vectors that will store the volatility for the two months

    # Vectors that will store the volatility for the two months, one slot per
    # entry of market_symbols. The original sized them with seq(length(data)),
    # but `data` is only assigned after the volatility loop, so that length did
    # not describe the data set.
    VOLMONTH1 <- numeric(length(market_symbols))

    VOLMONTH2 <- numeric(length(market_symbols))

    # Data frames that will store the price values and returns:
    # length(market_symbols) rows and four columns per entry of my_symbols
    returns <- data.frame(matrix(ncol = length(my_symbols) * 4,
                                 nrow = length(market_symbols)))

    pricePerasset <- data.frame(matrix(ncol = length(my_symbols) * 4,
                                       nrow = length(market_symbols)))

    # Loop that renames the columns for the data frames needed in the regression

    j <- 1

    k <- 1

    for (i in 1:length(my_symbols)*4)

    {

    colname=sprintf("%s_weekly return", my_symbols[k])name=sprintf("%s%d", "X", j)

    names(returns)[names(returns)==name]

  • 8/8/2019 piyush sachdeva - Homework 5

    16/48

      names(returns)[names(returns)==name]

  • 8/8/2019 piyush sachdeva - Homework 5

    17/48

      #price last month close

    pricePerasset[length(market_symbols)-j,6]=asset$VIX.Adjusted[nrow(asset)-21-j] #

    21 trading days

    #price last quarter close

    pricePerasset[length(market_symbols)-j,7]=asset$VIX.Adjusted[nrow(asset)-63-j]

    #63 trading days

    #price last year closepricePerasset[length(market_symbols)-j,8]=asset$VIX.Adjusted[nrow(asset)-252-j]

    #252 trading days

    #weekly returns

    returns[length(market_symbols)-

    j,5]=weeklyReturn(asset,type="log")[length(weeklyReturn(asset,type="log"))-j]#monthly returns

    returns[length(market_symbols)-

    j,6]=monthlyReturn(asset,type="log")[length(monthlyReturn(asset,type="log"))-j]

    #quarterly returns

    returns[length(market_symbols)-

    j,7]=quarterlyReturn(asset,type="log")[length(quarterlyReturn(asset,type="log"))-j]#yearly returns

    returns[length(market_symbols)-

    j,8]=quarterlyReturn(asset,type="log")[length(quarterlyReturn(asset,type="log"))-j]

    }

    # ^VXN data: daily history from 7560 days ago up to 30 days ago
    asset <- getSymbols(my_symbols[3], from = (Sys.Date() - (7560)),
                        to = (Sys.Date() - (30)), auto.assign = FALSE)

    print(my_symbols[3])

    # The return series do not depend on j, so compute each of them once
    # instead of twice per loop iteration.
    vxn_weekly <- weeklyReturn(asset, type = "log")
    vxn_monthly <- monthlyReturn(asset, type = "log")
    vxn_quarterly <- quarterlyReturn(asset, type = "log")

    # Row (length(market_symbols) - j) receives the values lagged by j periods;
    # columns 9-12 of both data frames belong to ^VXN.
    for (j in 0:(length(market_symbols) - 1)) {
      target_row <- length(market_symbols) - j

      # price last week close (5 trading days)
      pricePerasset[target_row, 9] <- asset$VXN.Adjusted[nrow(asset) - 5 - j]

      # price last month close (21 trading days)
      pricePerasset[target_row, 10] <- asset$VXN.Adjusted[nrow(asset) - 21 - j]

      # price last quarter close (63 trading days)
      pricePerasset[target_row, 11] <- asset$VXN.Adjusted[nrow(asset) - 63 - j]

      # price last year close (252 trading days)
      pricePerasset[target_row, 12] <- asset$VXN.Adjusted[nrow(asset) - 252 - j]

      # weekly returns
      returns[target_row, 9] <- vxn_weekly[length(vxn_weekly) - j]

      # monthly returns
      returns[target_row, 10] <- vxn_monthly[length(vxn_monthly) - j]

      # quarterly returns
      returns[target_row, 11] <- vxn_quarterly[length(vxn_quarterly) - j]

      # yearly returns
      # NOTE(review): this column is filled from quarterlyReturn(), exactly as in
      # the original. yearlyReturn() is presumably what was intended, but confirm
      # it yields at least length(market_symbols) periods before switching: the
      # index length(series) - j must stay positive for every j.
      returns[target_row, 12] <- vxn_quarterly[length(vxn_quarterly) - j]
    }

    # ^VXO data: daily history from 7560 days ago up to 30 days ago
    asset <- getSymbols(my_symbols[4], from = (Sys.Date() - (7560)),
                        to = (Sys.Date() - (30)), auto.assign = FALSE)

    print(my_symbols[4])

    # The return series do not depend on j, so compute each of them once
    # instead of twice per loop iteration.
    vxo_weekly <- weeklyReturn(asset, type = "log")
    vxo_monthly <- monthlyReturn(asset, type = "log")
    vxo_quarterly <- quarterlyReturn(asset, type = "log")

    # Row (length(market_symbols) - j) receives the values lagged by j periods;
    # columns 13-16 of both data frames belong to ^VXO.
    for (j in 0:(length(market_symbols) - 1)) {
      target_row <- length(market_symbols) - j

      # price last week close (5 trading days)
      pricePerasset[target_row, 13] <- asset$VXO.Adjusted[nrow(asset) - 5 - j]

      # price last month close (21 trading days)
      pricePerasset[target_row, 14] <- asset$VXO.Adjusted[nrow(asset) - 21 - j]

      # price last quarter close (63 trading days)
      pricePerasset[target_row, 15] <- asset$VXO.Adjusted[nrow(asset) - 63 - j]

      # price last year close (252 trading days)
      pricePerasset[target_row, 16] <- asset$VXO.Adjusted[nrow(asset) - 252 - j]

      # weekly returns
      returns[target_row, 13] <- vxo_weekly[length(vxo_weekly) - j]

      # monthly returns
      returns[target_row, 14] <- vxo_monthly[length(vxo_monthly) - j]

      # quarterly returns
      returns[target_row, 15] <- vxo_quarterly[length(vxo_quarterly) - j]

      # yearly returns
      # NOTE(review): this column is filled from quarterlyReturn(), exactly as in
      # the original. yearlyReturn() is presumably what was intended, but confirm
      # it yields at least length(market_symbols) periods before switching: the
      # index length(series) - j must stay positive for every j.
      returns[target_row, 16] <- vxo_quarterly[length(vxo_quarterly) - j]
    }

    # Computes the daily returns and volatility of the 70 initial stocks for two months
    for (i in seq_along(market_symbols)) {
      # Month 1: window from 60 days ago to 30 days ago
      month1 <- getSymbols(market_symbols[i], from = (Sys.Date() - 60),
                           to = (Sys.Date() - 30), auto.assign = FALSE)

      DAILYRETURNS <- dailyReturn(month1, type = "log")

      # Volatility = square root of the mean squared daily log return
      volatilitymonth1 <- sqrt(sum(DAILYRETURNS^2) / length(DAILYRETURNS))

      VOLMONTH1[i] <- volatilitymonth1

      # Month 2: window from 32 days ago to today
      month2 <- getSymbols(market_symbols[i], from = (Sys.Date() - 32),
                           to = Sys.Date(), auto.assign = FALSE)

      DAILYRETURNS2 <- dailyReturn(month2, type = "log")

      volatilitymonth2 <- sqrt(sum(DAILYRETURNS2^2) / length(DAILYRETURNS2))

      VOLMONTH2[i] <- volatilitymonth2
    }

    # Combine returns, prices and both volatility vectors into one data frame
    data <- data.frame(cbind(returns, pricePerasset, as.data.frame(VOLMONTH1),
                             as.data.frame(VOLMONTH2)))

    noNAdata <- na.omit(data) # removing N/A data

  • 8/8/2019 piyush sachdeva - Homework 5

    20/48

  • 8/8/2019 piyush sachdeva - Homework 5

    21/48

  • 8/8/2019 piyush sachdeva - Homework 5

    22/48

  • 8/8/2019 piyush sachdeva - Homework 5

    23/48

  • 8/8/2019 piyush sachdeva - Homework 5

    24/48

  • 8/8/2019 piyush sachdeva - Homework 5

    25/48

  • 8/8/2019 piyush sachdeva - Homework 5

    26/48

  • 8/8/2019 piyush sachdeva - Homework 5

    27/48

  • 8/8/2019 piyush sachdeva - Homework 5

    28/48

  • 8/8/2019 piyush sachdeva - Homework 5

    29/48

  • 8/8/2019 piyush sachdeva - Homework 5

    30/48

     #histogram drawing of the predictors and response to see if any transformations are

    needed

    for (i in 1:ncol(noNAdata)){hist(as.matrix(noNAdata[i]),main=colnames(noNAdata)[i],xlab=i)

    }

    # Noticing that several columns of the data might need transformations
    coltotransform <- c(3, 4, 17, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33)

    skew <- seq(length(coltotransform))

    # Columns whose skewness seems to be largely positive or negative are to be
    # transformed. Planned transformations: log(x) (if all data are positive),
    # x^2 or e^x depending on the skew. Only the log transform is applied below.
    library(moments)
    counter <- 0

    for (i in seq_along(coltotransform)) {
      col_index <- coltotransform[i]
      skew[i] <- skewness(as.matrix(noNAdata[col_index]), na.rm = FALSE)

      # log() is only applied when every value in the column is positive
      # (the comparison operator was lost in the transcript; `> 0` follows
      # from the "if all data are positive" rule stated above)
      if (min(noNAdata[col_index]) > 0) {
        temp <- noNAdata[col_index]
        noNAdata[col_index] <- log(noNAdata[col_index])

        # Label both histograms with the transformed column's own name and
        # index; the original indexed colnames() with the loop counter i, which
        # titled the plot with a different column.
        hist(as.matrix(noNAdata[col_index]),
             main = colnames(noNAdata)[col_index], xlab = col_index)

        # Overlay the untransformed values on the same axes
        hist(as.matrix(temp), col = 3, lwd = 2, lty = 2,
             main = colnames(noNAdata)[col_index], xlab = col_index, add = TRUE)

        # Running count of columns that were log-transformed
        counter <- counter + 1

        print(counter)
      }
    }

  • 8/8/2019 piyush sachdeva - Homework 5

    32/48

  • 8/8/2019 piyush sachdeva - Homework 5

    33/48

  • 8/8/2019 piyush sachdeva - Homework 5

    34/48

  • 8/8/2019 piyush sachdeva - Homework 5

    35/48

  • 8/8/2019 piyush sachdeva - Homework 5

    36/48

  • 8/8/2019 piyush sachdeva - Homework 5

    37/48

     #cox-box transformation search for the response

    library(car)

    BC_Transformation=boxcox(VOLMONTH2~.,data=noNAdata)#finds the optimal value for lambda

    lambda=BC_Transformation$x[which(BC_Transformation$y==max(BC_Transformation$y))]

  • 8/8/2019 piyush sachdeva - Homework 5

    38/48

     #stepwise simple linear analysis

    attach(no.na.data)

    fullmod=lm(VOLMONTH2~.,data=noNAdata) #full modelsummary(fullmod)

    linearmod=step(fullmod, direction="both")

    summary(linearmod)Call:lm(formula = VOLMONTH2 ~ X.GSPC_weekly.return + X.VXN_weekly.return +

    X.VXN_monthly.return + X.VXO_monthly.return + X.GSPC_monthly.price +X.VIX_quarterly.price + X.VXN_yearly.price + X.VXO_monthly.price +VOLMONTH1, data = noNAdata)

    Residuals:Min 1Q Median 3Q Max

    -0.007720 -0.002627 0.000164 0.001879 0.009041

    Coefficients:Estimate Std. Error t value Pr(>|t|)

    (Intercept) 1.550e-01 5.348e-02 2.898 0.00564 **X.GSPC_weekly.return -1.538e-01 8.126e-02 -1.893 0.06439 .X.VXN_weekly.return -1.418e-02 9.224e-03 -1.538 0.13068X.VXN_monthly.return -1.194e-02 7.371e-03 -1.620 0.11170X.VXO_monthly.return 1.667e-02 6.542e-03 2.548 0.01409 *X.GSPC_monthly.price -5.809e-05 2.196e-05 -2.645 0.01102 *X.VIX_quarterly.price 7.347e-03 3.705e-03 1.983 0.05308 .X.VXN_yearly.price 3.345e-03 7.823e-04 4.275 9.02e-05 ***X.VXO_monthly.price -1.412e-02 6.311e-03 -2.237 0.02997 *VOLMONTH1 1.249e-02 9.805e-04 12.736 < 2e-16 ***---Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

    Residual standard error: 0.003814 on 48 degrees of freedomMultiple R-squared: 0.8028, Adjusted R-squared: 0.7658F-statistic: 21.71 on 9 and 48 DF, p-value: 4.576e-14

    # Influential-observation search: Cook's distance for every observation of
    # the stepwise model, with a dashed reference line at 4 / (number of
    # observations) drawn from the first to the last observation
    cookd <- as.numeric(cooks.distance(linearmod))

    plot(cookd, main = "Cook's Distance", ylab = "Cook's Distance", xlab = "Observation")

    lines(x = c(1, length(cookd)), y = rep(4 / length(cookd), 2),
          lty = 2, col = 3, lwd = 2)

    -2 -1 0 1 2

    -140

    -60

    log-Likelih

    ood

     95%

  • 8/8/2019 piyush sachdeva - Homework 5

    39/48

     #residuals vs fitted values

    plot(as.numeric(linearmod$fitted.values),as.numeric(linearmod$residuals),

    pch=16,xlab="Fitted Values", ylab="Residuals",main="Fitted values vs. Residuals",cex.axis=1.3,cex.lab=1.3)

    abline(h = 0,lwd=2,col=3,lty=2)

    #residuals over time

    plot(linearmod$residuals, xlab="Time",ylab="Residuals", main="Evolution of residuals"

    ,cex.axis=1.3,cex.lab=1.3,pch=16)

    abline(h = 0,lwd=2,col=3,lty=2)

    0 10 20 30 40 50 600.00

    0.20

    Cook's Distance

    Observation

    Cook'sDis

    tance

    0.005 0.020-0.0

    05

    Fitted values vs. Residuals

    Fitted Values

    es

    uas

  • 8/8/2019 piyush sachdeva - Homework 5

    40/48

     #Actual response vs fitted values

    plot(as.numeric(linearmod$fitted.values),as.numeric(noNAdata$VOLMONTH2),

    pch=16,xlab="Fitted Values", ylab="response",main="Fitted values vs. response",cex.axis=1.3,cex.lab=1.3)

    abline(a=0,b=1,lwd=2,col=3,lty=1)

    #qq plot

    qqnorm(as.numeric(finalmod$residuals),cex.axis=1.3,cex.lab=1.3,pch=16,main="QQ PLot")

    qqline(as.numeric(finalmod$residuals))

    0 20 40 60-0.005

    Evolution of residuals

    Time

    esu

    as

    0.005 0.0200.0

    1

    Fitted values vs. response

    Fitted Values

    res

    ponse

  • 8/8/2019 piyush sachdeva - Homework 5

    41/48

     #fit gam

    library(mgcv)

    attributes(noNAdata)

    #noticing that there is a lack of data compared to the number of predictors and rows

    in the

    #data frame, those variables chosen by glm will be the ones included in gam

    #(data frame noNAdata of size 58x34)

    holdgam=gam(VOLMONTH2~s(X.GSPC_weekly.return)+(X.VXN_weekly.return)+s(X.VXN_monthly.r

    eturn)+

    (X.VXO_monthly.return)+s(X.GSPC_monthly.price)+s(X.VIX_quarterly.price)+

    (X.VXN_yearly.price)+s(X.VXO_monthly.price)+s(VOLMONTH1)

    ,data=noNAdata)

    summary(holdgam)Family: gaussianLink function: identity

    Formula:VOLMONTH2 ~ s(X.GSPC_weekly.return) + (X.VXN_weekly.return) +

    s(X.VXN_monthly.return) + (X.VXO_monthly.return) + s(X.GSPC_monthly.price) +

    s(X.VIX_quarterly.price) + (X.VXN_yearly.price) + s(X.VXO_monthly.price)+

    s(VOLMONTH1)

    Parametric coefficients:Estimate Std. Error t value Pr(>|t|)

    (Intercept) -0.0255803 0.0110534 -2.314 0.025176 *

    X.VXN_weekly.return -0.0076158 0.0091063 -0.836 0.407304X.VXO_monthly.return 0.0147098 0.0061787 2.381 0.021479 *X.VXN_yearly.price 0.0028573 0.0007523 3.798 0.000426 ***---Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

    Approximate significance of smooth terms:edf Ref.df F p-value

    s(X.GSPC_weekly.return) 1.000 1.000 1.922 0.1722s(X.VXN_monthly.return) 1.000 1.000 1.691 0.1999s(X.GSPC_monthly.price) 1.000 1.000 5.514 0.0231 *

    -2 -1 0 1 2-0.005

    QQ PLot

    Theoretical Quantilesampe

    u

    an

    es

  • 8/8/2019 piyush sachdeva - Homework 5

    42/48

    s(X.VIX_quarterly.price) 1.000 1.000 3.825 0.0565 .s(X.VXO_monthly.price) 1.905 2.342 3.399 0.0355 *s(VOLMONTH1) 2.128 2.616 73.487

  • 8/8/2019 piyush sachdeva - Homework 5

    43/48

     

    # Residuals vs fitted values
    # Close the current graphics device first; guarded because dev.off() fails
    # when only the null device (1) is open.
    if (dev.cur() > 1) dev.off()
    plot(as.numeric(holdgam$fitted.values), as.numeric(holdgam$residuals),
         pch = 16, xlab = "Fitted Values", ylab = "Residuals",
         main = "Fitted values vs. Residuals", cex.axis = 1.3, cex.lab = 1.3)

    # Dashed reference line at zero residual
    abline(h = 0, lwd = 2, col = 4, lty = 2)

    # Actual response vs fitted values
    plot(as.numeric(holdgam$fitted.values), as.numeric(noNAdata$VOLMONTH2),
         pch = 16, xlab = "Fitted Values", ylab = "response",
         main = "Fitted values vs. response", cex.axis = 1.3, cex.lab = 1.3)
    # Line of perfect agreement (response == fitted value)
    abline(a = 0, b = 1, lwd = 2, col = 4, lty = 1)

    2.4 3.0-0.010

    X.VXO_monthly.pris(X.VXO_monthly.price,1.9

    -5.5 -3.5-0.02

    0.01

    VOLMONTH1s(VOLMONTH1,2.13)

    0.005 0.020 0.035-0.006

    Fitted values vs. Residuals

    Fitted Values

    es

    uas

  • 8/8/2019 piyush sachdeva - Homework 5

    44/48

     #qq plot

    qqnorm(as.numeric(holdgam$residuals),cex.axis=1.3,cex.lab=1.3,pch=16,main="QQ PLot")

    qqline(as.numeric(holdgam$residuals))

    #fitting PPR

    holdppr=ppr(VOLMONTH2~(X.GSPC_weekly.return)+(X.VXN_weekly.return)+(X.VXN_monthly.ret

    urn)+

    (X.VXO_monthly.return)+(X.GSPC_monthly.price)+(X.VIX_quarterly.price)+

    (X.VXN_yearly.price)+(X.VXO_monthly.price)+(VOLMONTH1),nterms=2

    ,data=noNAdata,sm.method="gcvspline")

    summary(holdppr)Call:ppr(formula = VOLMONTH2 ~ (X.GSPC_weekly.return) + (X.VXN_weekly.return) +

    (X.VXN_monthly.return) + (X.VXO_monthly.return) + (X.GSPC_monthly.price)+

    (X.VIX_quarterly.price) + (X.VXN_yearly.price) + (X.VXO_monthly.price) +(VOLMONTH1), data = noNAdata, nterms = 2, sm.method = "gcvspline")

    Goodness of fit:

    0.005 0.020 0.0350.01

    Fitted values vs. response

    Fitted Values

    respon

    se

    -2 -1 0 1 2-0.006

    QQ PLot

    Theoretical Quantilesam

    pe

    uan

    es

  • 8/8/2019 piyush sachdeva - Homework 5

    45/48

  • 8/8/2019 piyush sachdeva - Homework 5

    46/48

     #qq plot

    qqnorm(as.numeric(holdppr$residuals),cex.axis=1.3,cex.lab=1.3,pch=16,main="QQ PLot")

    qqline(as.numeric(holdppr$residuals))

    #fit NNET

    library(nnet)

    holdnnet=nnet(VOLMONTH2~(X.GSPC_weekly.return)+(X.VXN_weekly.return)+(X.VXN_monthly.r

    eturn)+

    (X.VXO_monthly.return)+(X.GSPC_monthly.price)+(X.VIX_quarterly.price)+

    (X.VXN_yearly.price)+(X.VXO_monthly.price)+(VOLMONTH1),

    data=noNAdata,size=4, lineout=TRUE, decay=0.001, maxit=2000)summary(holdnnet)a 9-4-1 network with 45 weightsoptions were - decay=0.001b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h10.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00

    b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h20.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00

    b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h30.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00

    0.01 0.030.01

    Fitted values vs. response

    Fitted Values

    respon

    se

    -2 -1 0 1 2-0.003

    QQ PLot

    Theoretical Quantilesam

    pe

    uan

    es

  • 8/8/2019 piyush sachdeva - Homework 5

    47/48

     b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h40.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00

    b->o h1->o h2->o h3->o h4->o-0.81 -0.81 -0.81 -0.81 -0.81

    # Residuals vs fitted values for the neural network
    plot(as.numeric(holdnnet$fitted.values), as.numeric(holdnnet$residuals),
         pch = 16, xlab = "Fitted Values", ylab = "Residuals",
         main = "Fitted values vs. Residuals", cex.axis = 1.3, cex.lab = 1.3)

    # Dashed reference line at zero residual
    abline(h = 0, lwd = 2, col = 2, lty = 2)

    # Actual response vs fitted values
    plot(as.numeric(holdnnet$fitted.values), as.numeric(noNAdata$VOLMONTH2),
         pch = 16, xlab = "Fitted Values", ylab = "response",
         main = "Fitted values vs. response", cex.axis = 1.3, cex.lab = 1.3)

    # Line of perfect agreement (response == fitted value)
    abline(a = 0, b = 1, lwd = 2, col = 2, lty = 1)

    # QQ plot of the neural network's residuals (the original plotted holdppr's
    # residuals here, repeating the PPR QQ plot)
    qqnorm(as.numeric(holdnnet$residuals), cex.axis = 1.3, cex.lab = 1.3, pch = 16, main = "QQ PLot")

    qqline(as.numeric(holdnnet$residuals))

    0.01723805 0.01723830-0.0

    1

    Fitted values vs. Residuals

    Fitted Values

    es

    uas

    0.01723805 0.01723830

    0.01

    Fitted values vs. response

    Fitted Values

    response

  • 8/8/2019 piyush sachdeva - Homework 5

    48/48

     # Write the object `finalmod` to the file "LTRIANAL_models.Robj".
     # NOTE(review): `finalmod` is not assigned anywhere in the visible script —
     # confirm which fitted model it is meant to refer to before running this.
     save(finalmod, file="LTRIANAL_models.Robj")

    -2 -1 0 1 2-0.003

    QQ PLot

    Theoretical Quantilesampe

    u

    an

    es