library(effects)
x1 <- predictorEffect("x6", model)
plot(x1)
x2 <- predictorEffect("x2", model)
plot(x2, main="model COVID 19",xlab="X2", ylab="x") #single plot
plot(allEffects(model)) #all plot
# check whether the model is an adequate fit (residual deviance test)
with(model, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
## odds ratios and 95% CI
exp(cbind(OR = coef(model), confint(model)))
# MORE SUMMARIES ###########################################
anova(model) # Coefficients w/inferential tests
coef(model) # Coefficients
hist(coef(model))
confint(model) # CI for coefficients
hist(confint(model))
resid(model) # Residuals case-by-case
hist(residuals(model),main="model COVID 19" ) # Histogram of residuals
plot(residuals(model), main="model COVID 19" )
logLik(model)
BIC(model)
library(DescTools) # PseudoR2() comes from the DescTools package
PseudoR2(model)
predict(model)
hist(predict(model))
# pseudo R-squared (McFadden)
model$null.deviance
model$deviance
modelChi <- model$null.deviance - model$deviance
pseudo <- modelChi / model$null.deviance
pseudo
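# The ratio above is McFadden's pseudo R-squared; other variants are available
# from DescTools::PseudoR2 (a sketch, assuming DescTools is installed), e.g.:
DescTools::PseudoR2(model, which = "Nagelkerke")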
# likelihood-ratio chi-square test for the model (p-value)
Chidf <- model$df.null - model$df.residual
modelChi <- model$null.deviance - model$deviance
1 - pchisq(modelChi, Chidf)
# RSS (residual sum of squares)
RSS <- c(crossprod(model$residuals))
RSS
#Mean square error
MSE <- RSS / length(model$residuals)
MSE
#Root MSE
RMSE <- sqrt(MSE)
RMSE
# estimated residual variance (RSS / residual df); note these are working
# residuals, not Pearson residuals
sig2 <- RSS / model$df.residual
sig2
library(PLNmodels)
library(ggplot2)
library(corrplot)
Y <- read.csv(choose.files())
X <- read.csv(choose.files())
dat <- list(Y, X)
names(dat) <- c("output", "input")
d <- prepare_data(dat$output, dat$input)
model <- PLN(Abundance ~
LANE_WIDTH +
LENGTH +
LOG_AVG_AADT +
LOG_PAVEMENT_CONDITION +
HIGH_FREQ_TRANSIT +
eightyfivepercent_SPEED_ADJ +
P_SPEEDING_ADJ +
SL +
NL, data = d)
print(model) # model summary, including the variational lower bound (ELBO) and ICL
coef(model) # matrix of regression coefficients
sigma(model) #covariance matrix of the latent variable
vcov(model) #Variance-Covariance Matrix
fitted(model)
standard_error(model)
plot(fitted(model))
barplot(fitted(model))
#2 tailed z test
z<- coef(model)/standard_error(model)
p<- (1-pnorm(abs(z),0,1))*2
p #pvalue
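# Optional: collect the Wald-test pieces into one table (a sketch, not part of
# the original script; rows follow as.vector() flattening of the coefficient matrix)
ztab <- cbind(estimate = as.vector(coef(model)),
              std.error = as.vector(standard_error(model)),
              z = as.vector(z),
              p.value = as.vector(p))
round(ztab, 4)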
library(betareg)
a <- read.csv(choose.files())
# beta regression model
summary(betareg(P_SPEEDING_ADJ ~ log(AVG_AADT), data=a))
# Step 1 - understand data
# Load data
p <- read.csv(file.choose())
head(p)
# STEP 2 - EFA (exploratory factor analysis); an EFA sketch follows the exploratory plots below
#summary of variables
summary(p)
#variance
var(p)
# Dependent variable plot
hist(p$SPEEDING_CRASH,
main = "Histogram of Speeding Crash",
xlab = "Speeding Crash Number",
ylab = "Frequency")
# STEP 3 - Poisson regression model
model <- glm(formula = SPEEDING_CRASH ~ SL + NL + P_TRUCK + AVG_AADT + SIGNAL_NUM + SIGNAL_PER_MILE + STOPPED_NUM + STOPPED_PER_MILE +
               MEDIAN_WIDTH + MEDIAN_TYPE + P_RESIDENTIAL + P_COMMERCIAL + P_INDUSTRIAL + P_AGRICULTURAL + P_INSTITUTIONAL +
               P_GOVERNMENTAL + LANDUSE_MIX + POP_DEN + DAILY_TRANSIT + ISLDTYPE + ISLDWIDTH + OSLDTYPE + OSLDWIDTH +
               SURF_TYPE + SURFACE_WIDTH + LANE_WIDTH + POLES_PER_MILE + P_SINGLE_SIDEWALK_ONLY + P_BOTH_SIDEWALK_ONLY +
               SIDEWALK_WIDTH + P_SINGLE_BIKELANE_ONLY + P_BOTH_BIKELANE_ONLY + P_SINGLE_BIKESLOT_ONLY + P_BOTH_BIKESLOT_ONLY +
               P_SINGLE_SHAREDPATH_ONLY + P_BOTH_SHAREDPATH_ONLY + SHAREDPATH_WIDTH + NARROW_LANE + ASPHALT_SURFACE +
               PRESENCE_OF_MEDIAN_ISL_AT_CROSS + HIGH_FREQ_TRANSIT + PCT_POV + PCT_VEHICLE_0 + PCT_OLD_YOUNG + PCT_MEANS_PEDBIC +
               PAVEMENT_CONDITION + SHAREDPATH_DISTFMRD + FRQ_AM78,
             data = p,
             family = poisson)
print(summary(model))
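# Quick overdispersion check: the Pearson dispersion statistic should be near 1;
# values well above 1 motivate the robust standard errors computed below
# (a sketch, not part of the original script).
sum(residuals(model, type = "pearson")^2) / model$df.residual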
# robust std error calculation
library(sandwich)
cov.model <- vcovHC(model, type="HC0") #HC0:heteroskedasticity-robust standard error
std.err <- sqrt(diag(cov.model))
robust <- cbind(Estimate= coef(model), "Robust Std Err" = std.err,
"Pr(>|z|)" = 2 * pnorm(abs(coef(model)/std.err), lower.tail=FALSE),
LL = coef(model) - 1.96 * std.err,
UL = coef(model) + 1.96 * std.err) #95% confidence level
robust
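# Equivalent coefficient table using coeftest() with the robust covariance
# (a sketch; assumes the lmtest package is installed).
library(lmtest)
coeftest(model, vcov. = cov.model)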
# check whether the model is an adequate fit (residual deviance test)
with(model, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
# MORE SUMMARIES ###########################################
anova(model) # Coefficients w/inferential tests
coef(model) # Coefficients
confint(model) # CI for coefficients
resid(model) # Residuals case-by-case
hist(residuals(model) ) # Histogram of residuals
# CLEAN UP #################################################
# Clear environment
rm(list = ls())
# Clear packages (p_unload() comes from the pacman package)
pacman::p_unload(all) # Remove all add-ons
detach("package:datasets", unload = TRUE) # For base
# Clear plots
dev.off() # But only if there IS a plot
# Clear console
cat("\014") # ctrl+L
# Clear mind :)
require(aod)
require(ggplot2)
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## convert rank to a factor (categorical variable)
mydata$rank <- factor(mydata$rank)
## view first few rows
head(mydata)
summary(mydata)
xtabs(~rank + admit, data = mydata) # contingency table of admit by rank
myprobit <- glm(admit ~ gre + gpa + rank, family = binomial(link = "probit"),
data = mydata)
## model summary
summary(myprobit)
confint(myprobit)
wald.test(b = coef(myprobit), Sigma = vcov(myprobit), Terms = 4:6)
l <- cbind(0, 0, 0, 1, -1, 0) # one entry per coefficient (6 in total); this contrast tests rank2 = rank3
wald.test(b = coef(myprobit), Sigma = vcov(myprobit), L = l)
newdata <- data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4 * 4), gpa = rep(c(2.5, 3, 3.5, 4), each = 100 * 4), rank = factor(rep(rep(1:4,
each = 100), 4)))
head(newdata)
newdata[, c("p", "se")] <- predict(myprobit, newdata, type = "response", se.fit = TRUE)[-3]
ggplot(newdata, aes(x = gre, y = p, colour = rank)) + geom_line() + facet_wrap(~gpa)
## change in deviance
with(myprobit, null.deviance - deviance)
## change in degrees of freedom
with(myprobit, df.null - df.residual)
## chi square test p-value
with(myprobit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
logLik(myprobit)
#Coded by Tawkir Ahmed
library(ggplot2) # Used for plotting data
library(dplyr) # Used to extract columns in the data
library(rms) # Used to extract p-value from logistic model
library(aod)
theme_set(theme_gray() ) # the default
# logistic model
covid <- read.csv(choose.files())
labs <- attributes(covid)$labels
summary(covid)
# convert x to a factor; values outside levels 1-7 become NA
covid$x <- factor(covid$x, levels=c("1", "2", "3", "4", "5", "6", "7"))
# run our regression model
before <- glm(x~ x1+ x2+ x3+ x4+
x5+ x6,
data=covid, family="binomial")
summary(before)
# check whether the model is an adequate fit (residual deviance test)
with(before, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
## odds ratios and 95% CI
exp(cbind(OR = coef(before), confint(before)))
# MORE SUMMARIES ###########################################
anova(before) # Coefficients w/inferential tests
coef(before) # Coefficients
hist(coef(before))
confint(before) # CI for coefficients
hist(confint(before))
resid(before) # Residuals case-by-case
hist(residuals(before),main="Before COVID 19" ) # Histogram of residuals
plot(residuals(before), main="Before COVID 19" )
logLik(before)
BIC(before)
library(DescTools) # PseudoR2() comes from the DescTools package
PseudoR2(before)
predict(before)
hist(predict(before))
# pseudo R-squared (McFadden)
before$null.deviance
before$deviance
modelChi <- before$null.deviance - before$deviance
pseudo <- modelChi / before$null.deviance
pseudo
# likelihood-ratio chi-square test for the model (p-value)
Chidf <- before$df.null - before$df.residual
modelChi <- before$null.deviance - before$deviance
1 - pchisq(modelChi, Chidf)
# RSS (residual sum of squares)
RSS <- c(crossprod(before$residuals))
RSS
#Mean square error
MSE <- RSS / length(before$residuals)
MSE
#Root MSE
RMSE <- sqrt(MSE)
RMSE
# estimated residual variance (RSS / residual df); note these are working
# residuals, not Pearson residuals
sig2 <- RSS / before$df.residual
sig2
# DIAGNOSTIC PLOTS ##########################################
plot(before) # cycles through the four standard glm diagnostic plots
library(effects)
x1 <- predictorEffect("x6", before)
plot(x1)
x2 <- predictorEffect("x2", before)
plot(x2, main="Before COVID 19",xlab="X2", ylab="x") #single plot
plot(allEffects(before)) #all plot
library(betareg)
a <- read.csv(choose.files())
# beta regression model
summary(betareg(P_SPEEDING_ADJ ~ log(AVG_AADT), data=a))
# negative binomial model
require(foreign)
require(ggplot2)
require(MASS)
# predictor: proportion speeding multiplied by log(AVG_AADT)
a$SPEEDING_AADT <- a$P_SPEEDING_ADJ * log(a$AVG_AADT)
model1 <- glm.nb(formula = SPEEDING_CRASH ~ SPEEDING_AADT,
                 data = a)
print(summary(model1))
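# Optional: compare the negative binomial fit with a Poisson fit on the same
# predictor as an overdispersion check; the test is conservative because theta
# lies on the boundary (a sketch, not part of the original script).
pois1 <- glm(SPEEDING_CRASH ~ SPEEDING_AADT, data = a, family = poisson)
pchisq(2 * (as.numeric(logLik(model1)) - as.numeric(logLik(pois1))), df = 1, lower.tail = FALSE)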
library(neuralnet)
# creating training data set
TAWKIR=c(20,10,30,20,80,30)
AHMED=c(90,20,40,50,50,80)
Placed=c(1,0,0,0,1,1)
# combine the feature columns and the outcome into a single data frame
df=data.frame(TAWKIR,AHMED,Placed)
# load library
require(neuralnet)
# fit neural network
nn=neuralnet(Placed~TAWKIR+AHMED,data=df, hidden=3,act.fct = "logistic",
linear.output = FALSE)
# plot neural network
plot(nn)
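# Optional: score new candidates with the fitted network (a sketch; the example
# values below are made up for illustration).
newcand <- data.frame(TAWKIR = c(55, 25), AHMED = c(70, 30))
compute(nn, newcand)$net.result # predicted probability of being placed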
# (the random forest error-vs-trees plot is drawn after model fitting; see plot(model1) below)
library(randomForest)
# Load the dataset and explore
data1 <- read.csv(file.choose(), header = TRUE)
head(data1)
str(data1)
summary(data1)
# Split into Train and Validation sets
# Training Set : Validation Set = 70 : 30 (random)
set.seed(1000)
train <- sample(nrow(data1), 0.7*nrow(data1), replace = FALSE)
TrainSet <- data1[train,]
ValidSet <- data1[-train,]
summary(TrainSet)
summary(ValidSet)
# Create a Random Forest model with default parameters
model1 <- randomForest(SPEEDING_CRASH ~ ., data = TrainSet, importance = TRUE) # randomForest has no 'family' argument; a numeric response gives a regression forest
model1
plot(model1, main = "Error in different trees") # error against the number of trees
# or
# Fine tuning parameters of Random Forest model
model2 <- randomForest(SPEEDING_CRASH ~ ., data = TrainSet, ntree = 500, mtry = 6, importance = TRUE) # 'objective' is an xgboost argument, not a randomForest one
model2
# Predicting on the training set
predTrain <- predict(model2, TrainSet) # default type = "response" for a regression forest
# Classification-style accuracy checks do not apply to a count response;
# a validation-set check follows below
#table(predTrain, TrainSet$SPEEDING_CRASH)
#mean(predValid == ValidSet$SPEEDING_CRASH)
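# Optional check on the held-out validation set using RMSE, which suits a
# numeric/count response better than an accuracy table (a sketch, not part of
# the original script).
predValid <- predict(model2, ValidSet)
sqrt(mean((predValid - ValidSet$SPEEDING_CRASH)^2))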
# To check important variables
importance(model2)
varImpPlot(model2)
#OR
importance(model1)
varImpPlot(model1)
#Part 1: library input
suppressPackageStartupMessages({
library(SHAPforxgboost)
library(xgboost)
library(data.table)
library(ggplot2)
})
# Part 2: load data and compute SHAP values
a <- read.csv(file.choose())
X1 = as.matrix(a[,-1])
mod1 = xgboost::xgboost(
data = X1, label = a$SPEEDING_CRASH, gamma = 0, eta = 1,
lambda = 0,nrounds = 10, verbose = F, objective = "reg:squarederror")
# shap.values(model, X_dataset) returns the SHAP
# data matrix and ranked features by mean|SHAP|
shap_values <- shap.values(xgb_model = mod1, X_train = X1)
shap_values$mean_shap_score
# Part 3: long-format SHAP data and summary plots
shap_score_mat <- shap_values$shap_score
# shap.prep() returns the long-format SHAP data, built either from the model
# directly or from a given shap_contrib matrix
shap_long <- shap.prep(xgb_model = mod1, X_train = X1)
# the same result using the SHAP contributions computed above
shap_long <- shap.prep(shap_contrib = shap_score_mat, X_train = X1)
# -------------------------------------------------------------------------
shap.plot.summary(shap_long)
# the dilute option makes the plot faster when there are thousands of
# observations; see the documentation for details
shap.plot.summary(shap_long, x_bound = 1.5, dilute = 10)
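# Optional: dependence plot for the highest-ranked feature from the mean |SHAP|
# scores computed above (a sketch, not part of the original script).
top_feature <- names(shap_values$mean_shap_score)[1]
shap.plot.dependence(data_long = shap_long, x = top_feature)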
#end...................................
## Use dataset: USArrests
data(USArrests) # Get datafile
names(USArrests) # View variable names
## Scaled PCA using entire data.frame
pca1 = prcomp(USArrests, scale = TRUE)
## Both following commands produce same PCA as previous
pca2 = prcomp(~., data = USArrests, scale = TRUE)
pca3 = prcomp(~ Murder + Assault + Rape + UrbanPop,
data = USArrests, scale = TRUE)
pca1 # View result
names(pca1) # View elements in result object
summary(pca1) # Summary
## Plots for results...
## Scree plot of variances
plot(pca1, type = "lines", main = "PCA for USArrests")
## Biplot of result
biplot(pca1, col = 1, cex = c(0.8, 1.2), expand = 0.9)
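## Optional: proportion of variance explained by each component, computed from
## the standard deviations returned by prcomp (a sketch).
pve <- pca1$sdev^2 / sum(pca1$sdev^2)
cumsum(pve)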