Friday, December 18, 2020

model summary in R

 # check whether the model is a good fit (residual deviance goodness-of-fit test)

with(model, cbind(res.deviance = deviance, df = df.residual,

                   p = pchisq(deviance, df.residual, lower.tail=FALSE)))



## odds ratios and 95% CI ***********

exp(cbind(OR = coef(model), confint(model)))


# MORE SUMMARIES ###########################################


anova(model)            # Analysis of deviance table (sequential tests for each term)


coef(model)             # Coefficients 

hist(coef(model))


confint(model)          # CI for coefficients

hist(confint(model))

resid(model)            # Residuals case-by-case

hist(residuals(model),main="model COVID 19" ) # Histogram of residuals

plot(residuals(model), main="model COVID 19" )


logLik(model)

BIC(model)

PseudoR2(model)   # pseudo R-squared (requires a package such as DescTools)

predict(model)

hist(predict(model))

# pseudo R-squared from the change in deviance

model$null.deviance

model$deviance

modelChi <- model$null.deviance - model$deviance

pseudo <- modelChi / model$null.deviance

pseudo

# Compute the p-value of the model chi-square (likelihood-ratio test against the null model)

Chidf <- model$df.null - model$df.residual

modelChi <- model$null.deviance - model$deviance

1 - pchisq(modelChi, Chidf)
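As a cross-check on the deviance-based value above, McFadden's pseudo R-squared can be computed from the log-likelihoods of the fitted and intercept-only models. This is a minimal sketch assuming `model` is the glm fitted earlier:

# McFadden's pseudo R-squared: 1 - logLik(full) / logLik(null)
null_model <- update(model, . ~ 1)   # intercept-only model on the same data
1 - as.numeric(logLik(model) / logLik(null_model))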


# RSS (residual sum of squares)

RSS <- c(crossprod(model$residuals))

RSS

#Mean square error

MSE <- RSS / length(model$residuals)

MSE

#Root MSE

RMSE <- sqrt(MSE)

RMSE

#Pearson estimated residual variance

sig2 <- RSS / model$df.residual

sig2

Poisson lognormal regression model in R

 library(PLNmodels)

library(ggplot2)

library(corrplot)



Y <- read.csv(choose.files())

X <- read.csv(choose.files())

dat <- list(Y, X)

names(dat) <- c("output", "input")

d <- prepare_data(dat$output, dat$input)


model <- PLN(Abundance ~ 

             LANE_WIDTH               +

             LENGTH                    +

             LOG_AVG_AADT                 +

             LOG_PAVEMENT_CONDITION         +

             HIGH_FREQ_TRANSIT           +

             eightyfivepercent_SPEED_ADJ   +

             P_SPEEDING_ADJ               +

             SL                         +

             NL, data = d)



print(model)      # model summary: variational lower bound and criteria (BIC, ICL)

coef(model)       # matrix of regression coefficients

sigma(model)      #covariance matrix of the latent variable

vcov(model)       #Variance-Covariance Matrix

fitted(model)

standard_error(model)

plot(fitted(model))

barplot(fitted(model))


# two-tailed z-test on the coefficients

z<- coef(model)/standard_error(model)

p<- (1-pnorm(abs(z),0,1))*2

p        #pvalue
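To read the results in one place, the estimates, standard errors, z statistics, and p-values above can be bound into a single table. A small sketch reusing the objects just created (the layout depends on how many responses the PLN model has):

# combine PLN estimates, standard errors, z statistics and p-values into one table
results <- cbind(Estimate  = as.vector(coef(model)),
                 Std.Error = as.vector(standard_error(model)),
                 z         = as.vector(z),
                 p         = as.vector(p))
round(results, 4)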



Friday, November 13, 2020

Poisson Model

 # Step 1 - understand data

# Load data

p <- read.csv(file.choose())


head(p)


# STEP 2 - EDA (exploratory data analysis)


#summary of variables

summary(p)


#variance

var(p)



# Dependent variable plot


hist(p$SPEEDING_CRASH,

     main = "Histogram of Speeding Crash",

     xlab = "Speeding Crash Number",

     ylab = "Frequency")



#STEP 3- Poisson regression model


model <- glm(formula = SPEEDING_CRASH ~ SL + NL + P_TRUCK + AVG_AADT + SIGNAL_NUM + SIGNAL_PER_MILE +
               STOPPED_NUM + STOPPED_PER_MILE + MEDIAN_WIDTH + MEDIAN_TYPE + P_RESIDENTIAL +
               P_COMMERCIAL + P_INDUSTRIAL + P_AGRICULTURAL + P_INSTITUTIONAL + P_GOVERNMENTAL +
               LANDUSE_MIX + POP_DEN + DAILY_TRANSIT + ISLDTYPE + ISLDWIDTH + OSLDTYPE + OSLDWIDTH +
               SURF_TYPE + SURFACE_WIDTH + LANE_WIDTH + POLES_PER_MILE + P_SINGLE_SIDEWALK_ONLY +
               P_BOTH_SIDEWALK_ONLY + SIDEWALK_WIDTH + P_SINGLE_BIKELANE_ONLY + P_BOTH_BIKELANE_ONLY +
               P_SINGLE_BIKESLOT_ONLY + P_BOTH_BIKESLOT_ONLY + P_SINGLE_SHAREDPATH_ONLY +
               P_BOTH_SHAREDPATH_ONLY + SHAREDPATH_WIDTH + NARROW_LANE + ASPHALT_SURFACE +
               PRESENCE_OF_MEDIAN_ISL_AT_CROSS + HIGH_FREQ_TRANSIT + PCT_POV + PCT_VEHICLE_0 +
               PCT_OLD_YOUNG + PCT_MEANS_PEDBIC + PAVEMENT_CONDITION + SHAREDPATH_DISTFMRD + FRQ_AM78,
             data = p,
             family = poisson)


print(summary(model))
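Before relying on the Poisson standard errors, it helps to check for overdispersion. A rough base-R diagnostic (a sketch, no extra packages assumed) is the Pearson chi-square statistic divided by the residual degrees of freedom, which should be near 1 for a well-specified Poisson model:

# rough overdispersion check: values well above 1 suggest overdispersion
sum(residuals(model, type = "pearson")^2) / df.residual(model)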



# robust std error calculation


library(sandwich)

cov.model <- vcovHC(model, type="HC0")    #HC0:heteroskedasticity-robust standard error

std.err <- sqrt(diag(cov.model))

robust <- cbind(Estimate= coef(model), "Robust Std Err" = std.err,

                "Pr(>|z|)" = 2 * pnorm(abs(coef(model)/std.err), lower.tail=FALSE),

                LL = coef(model) - 1.96 * std.err,

                UL = coef(model) + 1.96 * std.err)  #95% confidence level


robust
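An equivalent, more compact route to the same robust inference is coeftest() from the lmtest package, reusing the HC0 covariance matrix computed above (a sketch, assuming lmtest is installed):

# robust z tests in one call, using the sandwich covariance from above
library(lmtest)
coeftest(model, vcov. = cov.model)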



# check whether the model is a good fit (residual deviance goodness-of-fit test)

with(model, cbind(res.deviance = deviance, df = df.residual,

                   p = pchisq(deviance, df.residual, lower.tail=FALSE)))


# MORE SUMMARIES ###########################################


anova(model)            # Analysis of deviance table (sequential tests for each term)

coef(model)             # Coefficients 

confint(model)          # CI for coefficients

resid(model)            # Residuals case-by-case

hist(residuals(model) ) # Histogram of residuals




# CLEAN UP #################################################


# Clear environment

rm(list = ls()) 


# Clear packages

pacman::p_unload(all)  # Remove all add-on packages (requires the pacman package)

detach("package:datasets", unload = TRUE)  # For base


# Clear plots

dev.off()  # But only if there IS a plot


# Clear console

cat("\014")  # ctrl+L


# Clear mind :)

Probit Model

 require(aod)

require(ggplot2)


mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")


## convert rank to a factor (categorical variable)

mydata$rank <- factor(mydata$rank)


## view first few rows


head(mydata)


summary(mydata)


xtabs(~rank + admit, data = mydata)  # contingency table of admit by rank


myprobit <- glm(admit ~ gre + gpa + rank, family = binomial(link = "probit"), 

                data = mydata)


## model summary

summary(myprobit)


confint(myprobit)


wald.test(b = coef(myprobit), Sigma = vcov(myprobit), Terms = 4:6)


l <- cbind(0, 0, 0, 1, -1, 0)  # one entry per coefficient (6 in total); tests whether the rank2 and rank3 coefficients are equal

wald.test(b = coef(myprobit), Sigma = vcov(myprobit), L = l)
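Both Wald tests above refer to coefficients by position, so it is worth confirming that positions 4 to 6 really are the rank dummies before reading the results (a quick check, nothing new assumed):

# positions 4:6 should be rank2, rank3 and rank4
names(coef(myprobit))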


newdata <- data.frame(gre  = rep(seq(from = 200, to = 800, length.out = 100), 4 * 4),
                      gpa  = rep(c(2.5, 3, 3.5, 4), each = 100 * 4),
                      rank = factor(rep(rep(1:4, each = 100), 4)))


head(newdata)


newdata[, c("p", "se")] <- predict(myprobit, newdata, type = "response", se.fit = TRUE)[-3]


ggplot(newdata, aes(x = gre, y = p, colour = rank)) + geom_line() + facet_wrap(~gpa)


## change in deviance

with(myprobit, null.deviance - deviance)


## change in degrees of freedom

with(myprobit, df.null - df.residual)


## chi square test p-value

with(myprobit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))


logLik(myprobit)


Logistic Model

 #Coded by Tawkir Ahmed 

library(ggplot2) # used for plotting data

library(dplyr) # Used to extract columns in the data

library(rms) # Used to extract p-value from logistic model

library(aod)


theme_set(theme_gray() ) # the default

# logistic model

covid <- read.csv(choose.files())

labs <- attributes(covid)$labels


summary(covid)


# convert the outcome to a factor; values outside levels 1-7 become NA

covid$x <- factor(covid$x, levels=c("1", "2", "3", "4", "5", "6", "7"))

# run our regression model

before <- glm(x~ x1+ x2+ x3+ x4+

                 x5+ x6, 

               data=covid, family="binomial")

summary(before)


# check whether the model is a good fit (residual deviance goodness-of-fit test)

with(before, cbind(res.deviance = deviance, df = df.residual,

                   p = pchisq(deviance, df.residual, lower.tail=FALSE)))



## odds ratios and 95% CI ***********

exp(cbind(OR = coef(before), confint(before)))


# MORE SUMMARIES ###########################################


anova(before)            # Analysis of deviance table (sequential tests for each term)


coef(before)             # Coefficients 

hist(coef(before))


confint(before)          # CI for coefficients

hist(confint(before))

resid(before)            # Residuals case-by-case

hist(residuals(before),main="Before COVID 19" ) # Histogram of residuals

plot(residuals(before), main="Before COVID 19" )


logLik(before)

BIC(before)

PseudoR2(before)   # pseudo R-squared (requires a package such as DescTools)

predict(before)

hist(predict(before))

# pseudo R-squared from the change in deviance

before$null.deviance

before$deviance

modelChi <- before$null.deviance - before$deviance

pseudo <- modelChi / before$null.deviance

pseudo

# Compute the p-value of the model chi-square (likelihood-ratio test against the null model)

Chidf <- before$df.null - before$df.residual

modelChi <- before$null.deviance - before$deviance

1 - pchisq(modelChi, Chidf)


# RSS (residual sum of squares)

RSS <- c(crossprod(before$residuals))

RSS

#Mean square error

MSE <- RSS / length(before$residuals)

MSE

#Root MSE

RMSE <- sqrt(MSE)

RMSE

#Pearson estimated residual variance

sig2 <- RSS / before$df.residual

sig2


# CLEAN UP #################################################

plot(before)  # produces the four standard diagnostic plots (press Return to step through them)



library(effects)

x1 <- predictorEffect("x6", before)

plot(x1)


x2 <- predictorEffect("x2", before) 

plot(x2, main="Before COVID 19",xlab="X2", ylab="x")  #single plot


plot(allEffects(before)) #all plot


Negative Binomial Model

 library(betareg)

a <- read.csv(choose.files())

# beta regression model

summary(betareg(P_SPEEDING_ADJ ~ log(AVG_AADT), data=a))


#negative binomial model

require(foreign)

require(ggplot2)

require(MASS)

speeding <- a$P_SPEEDING_ADJ

log_aadt <- log(a$AVG_AADT)

SPEEDING_AADT <- speeding * log_aadt   # product of speeding share and log(AADT)


model1= glm.nb(formula=SPEEDING_CRASH ~SPEEDING_AADT  , 

               data = a)


print(summary(model1))
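The glm.nb fit also stores the estimated dispersion parameter theta and its standard error, which show how far the counts are from the Poisson case (a sketch using components returned by glm.nb):

# negative binomial dispersion parameter (small theta = strong overdispersion)
model1$theta
model1$SE.theta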

Sunday, September 27, 2020

Neural Network in R

 library(neuralnet)


# creating training data set

TAWKIR=c(20,10,30,20,80,30)

AHMED=c(90,20,40,50,50,80)

Placed=c(1,0,0,0,1,1)

# Here, you will combine multiple columns or features into a single set of data

df=data.frame(TAWKIR,AHMED,Placed)


# load library

require(neuralnet)


# fit neural network

nn=neuralnet(Placed~TAWKIR+AHMED,data=df, hidden=3,act.fct = "logistic",

             linear.output = FALSE)

# plot neural network

plot(nn)
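Once trained, the network can score new candidates with compute(); the scores below are made-up values used only to illustrate the call (a sketch):

# predicted placement probability for hypothetical new scores (illustrative values only)
new_scores <- data.frame(TAWKIR = c(40, 70), AHMED = c(70, 30))
compute(nn, new_scores)$net.result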





Friday, September 18, 2020

Random Forest plot

 library(randomForest)

# Load the dataset and explore

data1 <- read.csv(file.choose(), header = TRUE)


head(data1)


str(data1)


summary(data1)



# Split into Train and Validation sets

# Training Set : Validation Set = 70 : 30 (random)

set.seed(1000)

train <- sample(nrow(data1), 0.7*nrow(data1), replace = FALSE)

TrainSet <- data1[train,]

ValidSet <- data1[-train,]

summary(TrainSet)

summary(ValidSet)


# Create a Random Forest model with default parameters

model1 <- randomForest(SPEEDING_CRASH ~ ., data = TrainSet, importance = TRUE)   # numeric response gives a regression forest; randomForest() has no family argument

model1


# or

# Fine tuning parameters of Random Forest model

model2 <- randomForest(SPEEDING_CRASH ~ ., data = TrainSet, ntree = 500, mtry = 6, importance = TRUE)

model2


# Predicting on train set

predTrain <- predict(model2, TrainSet)   # numeric predictions from the regression forest

# Checking fit on the training set (the accuracy table below only applies if the response is a factor)

#table(predTrain, TrainSet$SPEEDING_CRASH)  


#mean(predValid == ValidSet$SPEEDING_CRASH)  
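The commented-out accuracy checks need predictions on the validation set first; since SPEEDING_CRASH is a count, a regression-style error summary is more natural here. A sketch under that assumption:

# predict on the held-out validation set and report the root mean squared error
predValid <- predict(model2, ValidSet)
sqrt(mean((predValid - ValidSet$SPEEDING_CRASH)^2))   # validation RMSE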


# To check important variables

importance(model2)        

varImpPlot(model2)

#OR


importance(model1)        

varImpPlot(model1)




SHAP plot

 #Part 1: library input

suppressPackageStartupMessages({

  library(SHAPforxgboost)

  library(xgboost)

  library(data.table)

  library(ggplot2)

})


#part 2:

#file load and shap value calculation

a <- read.csv(file.choose())

X1 = as.matrix(a[,-1])

mod1 = xgboost::xgboost(

  data = X1, label = a$SPEEDING_CRASH, gamma = 0, eta = 1, 

  lambda = 0,nrounds = 10, verbose = F, objective = "reg:squarederror")


# shap.values(model, X_dataset) returns the SHAP

# data matrix and ranked features by mean|SHAP|

shap_values <- shap.values(xgb_model = mod1, X_train = X1)

shap_values$mean_shap_score



#part 3:

shap_values_iris <- shap_values$shap_score


# shap.prep() returns the long-format SHAP data, here computed directly from the model

shap_long_iris <- shap.prep(xgb_model = mod1, X_train = X1)

# equivalent: supply the SHAP contributions computed above

shap_long_iris <- shap.prep(shap_contrib = shap_values_iris, X_train = X1)


# -------------------------------------------------------------------------



shap.plot.summary(shap_long_iris)


# the dilute option speeds up the plot when there are thousands of observations

# please see documentation for details. 


shap.plot.summary(shap_long_iris, x_bound  = 1.5, dilute = 10)
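Beyond the summary plot, SHAPforxgboost can draw a dependence plot for a single predictor; the feature name below is a placeholder for whichever column exists in your data (a sketch):

# SHAP dependence plot for one feature ("LENGTH" is a hypothetical column name)
shap.plot.dependence(data_long = shap_long_iris, x = "LENGTH")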

#end...................................



Text plot

 ## A blank plot to set up a coordinate system

## The finished plot demonstrates text families, rotation, and line spacing
plot(0:10, 0:10, type = "n")
## Some regular text as a baseline
text(2,10, "Regular text", pos = 4)
## Set text larger and use serif family
par(list(cex = 2, family = "serif"))
## Add some text
text(2,8, "Serif Family", pos = 4)
## Alter rotation and set sans serif family
par(list(srt = 180, family = "sans"))
## Add some text note pos is opposite to previous
text(2,6, "Sans Family", pos = 2)
## Alter rotation and set monospace family
par(list(srt = 90, family = "mono"))
## Add some text
text(8,6, "Monospace Family")
## Reset parameters
par(list(cex = 1, srt = 0, family = ""))
## Create multi-line text
text(2,4, "Multi-line\ntext with\ndefault spacing", pos = 4)
## Alter line height
par(lheight = 2)
## More multi-line text
text(4, 2, "Multi-line\ntext with\ncustom spacing", pos = 4)
## Reset line height
par(lheight = 1)
  



Biplot for PCA

 ## Use datasets:USArrests

data(USArrests) # Get datafile

names(USArrests) # View variable names



## Scaled PCA using entire data.frame

pca1 = prcomp(USArrests, scale = TRUE)


## Both following commands produce same PCA as previous

pca2 = prcomp(~., data = USArrests, scale = TRUE)

pca3 = prcomp(~ Murder + Assault + Rape + UrbanPop,

                data = USArrests, scale = TRUE)


pca1 # View result


names(pca1) # View elements in result object


summary(pca1) # Summary


## Plots for results...

## Scree-plot of variances (Figure 2-5)

#plot(pca1, type = "lines", main = "PCA for USArrests")


## Bi-plot of result (Figure 2-6)

biplot(pca1, col = 1, cex = c(0.8, 1.2), expand = 0.9)




Scree plot from PCA

 ## Use datasets:USArrests

data(USArrests) # Get datafile

names(USArrests) # View variable names


## Scaled PCA using entire data.frame

pca1 = prcomp(USArrests, scale = TRUE)


## Both following commands produce same PCA as previous

pca2 = prcomp(~., data = USArrests, scale = TRUE)

pca3 = prcomp(~ Murder + Assault + Rape + UrbanPop,

                data = USArrests, scale = TRUE)


pca1 # View result


names(pca1) # View elements in result object


summary(pca1) # Summary


## Plots for results...

## Scree-plot of variances 

plot(pca1, type = "lines", main = "PCA for USArrests")
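The same information shown by the scree plot can be computed directly from the standard deviations stored in the prcomp object (a short sketch):

# proportion of variance explained by each component, and the running total
pve <- pca1$sdev^2 / sum(pca1$sdev^2)
round(cbind(PVE = pve, Cumulative = cumsum(pve)), 3)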



