Recently Published
hist:t-c piped
library(plotly)
# Interactive histogram of the difference t - c, piped straight to print()
p <- plot_ly(x = (t - c), type = "histogram") %>% print()
t - c
# Base-graphics equivalent
hist(t - c)
# Same plotly histogram, stored first and printed on its own
p <- plot_ly(x = (t - c), type = "histogram")
p
# Box plot of the same difference (assumes t and c from above)
boxplot(t - c)
library(TTR)
set.seed(1234)                       # set.seed is a function call, not an assignment
x <- data.frame(runif(20, 10, 90))
x[[2]] <- runMax(x[[1]], n = 5)      # rolling 5-period maximum from TTR
x
# Cumulative sums are row-wise, so use mutate() rather than summarise()
tb1 %>% group_by(Timeframe.x) %>% mutate(cumsum = cumsum(SPX.Close.x))
# cummax() is the running maximum from the start of the series
cummax(x[[1]])
# Piping a grouped data frame straight into cummax() does not work; wrap it in mutate()
z <- tb1 %>% group_by(Timeframe.x) %>% mutate(open_cummax = cummax(SPX.Open.x))
z
cummax(tb1$SPX.Open.x)
# aggregate() is not the right tool here either; mutate() keeps the grouping
tb2 <- tb1 %>% group_by(Timeframe.x) %>% mutate(open_y_cummax = cummax(SPX.Open.y))
a<-cummax(tb1$SPX.Open.y)
a
summary(a)
df <- tb
tb1$year <- factor(tb1$year)
df %>%
  # as.numeric() on a factor returns the level codes; convert via as.character() first
  group_by(year = as.numeric(as.character(year))) %>%
  mutate(SoFarMax = cummax(Timeframe.x))
# Define the cars vector with 5 values
cars <- c(1, 3, 6, 4, 9)
# Graph cars
barplot(cars)
# Avoid naming an object "plot" (it masks the base plot() function)
bar_vals <- a
barplot(bar_vals)
template by years
ggplot() + geom_line(data = tb1, aes(x = Timeframe.x, y = SPX.Open.x, color = year))
template
# plot the data using ggplot2 and pipes
tb %>%
  ggplot(aes(x = `Date & Time`, y = SPX.Open.x)) +
  geom_point(color = "darkorchid4") +
  # labels below are left over from the source template (Boulder precipitation)
  labs(title = "Precipitation - Boulder, Colorado",
       subtitle = "The data frame is sent to the plot using pipes",
       y = "Daily precipitation (inches)",
       x = "Date") +
  theme_bw(base_size = 15)
ROCR tb2 vip seg
library(ROCR)
# `predict` must already hold predicted probabilities for data_test (see the logistic example below)
ROCRpred <- prediction(predict, data_test$top)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
cor_all_vars
library(GGally)
# Convert data to numeric
corr <- data.frame(lapply(test, as.integer))
# Plot the graph
ggcorr(corr,
       method = c("pairwise", "spearman"),
       nbreaks = 6,
       hjust = 0.8,
       label = TRUE,
       label_size = 3,
       color = "grey50")
NGR
# Box plot of NGR by top group, with the mean marked
ggplot(test, aes(x = factor(top), y = NGR)) +
  geom_boxplot() +
  stat_summary(fun = mean,   # fun.y was deprecated in ggplot2 3.3.0
               geom = "point",
               size = 3,
               color = "steelblue") +
  theme_classic()
example - logistic 3
formula <- income~.
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
lapply(logit, class)[1:3]
logit$aic
predict <- predict(logit, data_test, type = 'response')
# confusion matrix
table_mat <- table(data_test$income, predict > 0.5)
table_mat
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
# Precision vs Recall
# Precision looks at the accuracy of the positive prediction. Recall is the ratio
# of positive instances that are correctly detected by the classifier;
precision <- function(matrix) {
  # True positive
  tp <- matrix[2, 2]
  # False positive
  fp <- matrix[1, 2]
  return(tp / (tp + fp))
}
recall <- function(matrix) {
  # True positive
  tp <- matrix[2, 2]
  # False negative
  fn <- matrix[2, 1]
  return(tp / (tp + fn))
}
prec <- precision(table_mat)
prec
rec <- recall(table_mat)
rec
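# Precision and recall can be combined into the F1 score (their harmonic mean);
# a small addition beyond the source tutorial:
f1 <- 2 * ((prec * rec) / (prec + rec))
f1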
library(ROCR)
ROCRpred <- prediction(predict, data_test$income)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
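# The same prediction object also yields the area under the ROC curve;
# this follow-up uses standard ROCR calls, not code from the source:
auc <- performance(ROCRpred, measure = "auc")@y.values[[1]]
auc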
example - logistic correlation GGally package
library(GGally)
# Convert data to numeric
corr <- data.frame(lapply(recast_data, as.integer))
# Plot the graph
ggcorr(corr,
       method = c("pairwise", "spearman"),
       nbreaks = 6,
       hjust = 0.8,
       label = TRUE,
       label_size = 3,
       color = "grey50")
example - logistic
# https://www.guru99.com/r-generalized-linear-model.html
library(dplyr)
data_adult <-
read.csv("https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/data_adult.csv")
glimpse(data_adult)
continuous <- select_if(data_adult, is.numeric)
summary(continuous)
# Kernel density curve (no histogram layer is actually drawn here)
library(ggplot2)
ggplot(continuous, aes(x = hours.per.week)) +
  geom_density(alpha = .2, fill = "#FF6666")
top_one_percent <- quantile(data_adult$hours.per.week, .99)
top_one_percent
data_adult_drop <- data_adult %>%
  filter(hours.per.week < top_one_percent)
dim(data_adult_drop)
data_adult_rescale <- data_adult_drop %>%
  mutate_if(is.numeric, ~ as.numeric(scale(.)))   # funs() is deprecated in dplyr; use a lambda
head(data_adult_rescale)
# Select categorical column
factor <- data.frame(select_if(data_adult_rescale, is.factor))
ncol(factor)
library(ggplot2)
# Create graph for each column
graph <- lapply(names(factor),
                function(x)
                  ggplot(factor, aes(get(x))) +
                    geom_bar() +
                    theme(axis.text.x = element_text(angle = 90)))
graph
recast_data <- data_adult_rescale %>%
  select(-X) %>%
  mutate(education = factor(
    ifelse(education %in% c("Preschool", "10th", "11th", "12th",
                            "1st-4th", "5th-6th", "7th-8th", "9th"), "dropout",
    ifelse(education == "HS-grad", "HighGrad",
    ifelse(education %in% c("Some-college", "Assoc-acdm", "Assoc-voc"), "Community",
    ifelse(education == "Bachelors", "Bachelors",
    ifelse(education %in% c("Masters", "Prof-school"), "Master", "PhD")))))))
recast_data %>%
  group_by(education) %>%
  summarize(average_educ_year = mean(educational.num),
            count = n()) %>%
  arrange(average_educ_year)
# Change level marry
recast_data <- recast_data %>%
  mutate(marital.status = factor(
    ifelse(marital.status %in% c("Never-married", "Married-spouse-absent"), "Not_married",
    ifelse(marital.status %in% c("Married-AF-spouse", "Married-civ-spouse"), "Married",
    ifelse(marital.status %in% c("Separated", "Divorced"), "Separated", "Widow")))))
table(recast_data$marital.status)
# Plot gender income
ggplot(recast_data, aes(x = gender, fill = income)) +
  geom_bar(position = "fill") +
  theme_classic()
# Plot origin income
ggplot(recast_data, aes(x = race, fill = income)) +
  geom_bar(position = "fill") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))
# box plot gender working time
ggplot(recast_data, aes(x = gender, y = hours.per.week)) +
  geom_boxplot() +
  stat_summary(fun = mean,   # fun.y was deprecated in ggplot2 3.3.0
               geom = "point",
               size = 3,
               color = "steelblue") +
  theme_classic()
# Plot distribution working time by education
ggplot(recast_data, aes(x = hours.per.week)) +
  geom_density(aes(color = education), alpha = 0.5) +
  theme_classic()
anova <- aov(hours.per.week~education, recast_data)
summary(anova)
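# If the ANOVA is significant, pairwise differences between education levels
# can be inspected with Tukey's HSD (an added step, not in the source tutorial):
TukeyHSD(anova)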
library(ggplot2)
ggplot(recast_data, aes(x = age, y = hours.per.week)) +
  geom_point(aes(color = income),
             size = 0.5) +
  stat_smooth(method = 'lm',
              formula = y ~ poly(x, 2),
              se = TRUE,
              aes(color = income)) +
  theme_classic()
chaid tree v1
#https://rstudio-pubs-static.s3.amazonaws.com/219097_49ca9b3b3a724f3c9bfed56551c37bc4.html
# chaid() and chaid_control() come from the CHAID package (R-Forge):
# install.packages("CHAID", repos = "http://R-Forge.R-project.org")
library(CHAID)
set.seed(290875)
data("tb2cF")
tb2cF_sampleS <- tb2cF[sample(1:nrow(tb2cF), 125000), ]
ctrl <- chaid_control(minsplit = 20, minbucket = 5, minprob = 0)
# Note: the fit uses the full tb2cF; the sample drawn above is unused
chaid <- chaid(top ~ ., data = tb2cF, control = ctrl)
print(chaid)
plot(chaid, type = "simple")
Plot
# Same CHAID fit as in "chaid tree v1" above; this snippet differs only in using the default plot
plot(chaid)
ctree
tb2g
library("party")
# Drop rows with missing values first (the source tutorial did this for its
# missing ozone records; complete.cases() is one way to do the same for tb2g)
tb2g <- tb2g[complete.cases(tb2g), ]
# Use ctree to model top as a function of all other covariates
mod18.ct <- ctree(top ~ ., data = tb2g, controls = ctree_control(maxsurrogate = 3))
mod18.ct
plot(mod18.ct)
tree example
library(ISLR)
data(package="ISLR")
carseats<-Carseats
require(tree)
names(carseats)
hist(carseats$Sales)
# tree() needs a factor response, so wrap the ifelse in as.factor()
High <- as.factor(ifelse(carseats$Sales <= 8, "No", "Yes"))
carseats <- data.frame(carseats, High)
tree.carseats <- tree(High ~ . - Sales, data = carseats)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats, pretty = 0)
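# The ISLR lab usually continues by cross-validating and pruning; a sketch of
# that follow-up using standard tree-package calls:
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
plot(cv.carseats$size, cv.carseats$dev, type = "b")
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]
prune.carseats <- prune.misclass(tree.carseats, best = best.size)
plot(prune.carseats); text(prune.carseats, pretty = 0)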
tree #1
# decision tree ####
install.packages("rpart")
library(rpart)
#grow tree
fit <- rpart(top ~ . , method="class", data=tb2f)
printcp(fit) # display the results
# cross-validation results
plotcp(fit)
# detailed summary of splits
summary(fit)
# plot tree
plot(fit, uniform=TRUE, main="Classification Tree for Top ~ NUTS2_NAME + ACORN_TYPE_DESC")
text(fit, use.n=TRUE, all=TRUE, cex=.8)
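# plotcp() is normally followed by pruning at the cp with the lowest
# cross-validated error; a minimal sketch of that step:
best_cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
pfit <- prune(fit, cp = best_cp)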
scaled numeric
source("http://www.sthda.com/upload/rquery_cormat.r")
rquery.cormat(tb2c, type="full")
NUTS~ACORN v1
install.packages("PerformanceAnalytics")
library("PerformanceAnalytics")
my_data <- data_v2_Trsf[, c(2,3,4,5)]
chart.Correlation(my_data, histogram=TRUE, pch=19)
seg_behaviour frequency
library(ggplot2)   # library() returns invisibly; don't assign it to seg_behaviour
ggplot(data = data_v1) +
  aes(x = SEG_BEHAVIOUR) +
  geom_bar(fill = "#0c4c8a") +
  theme_minimal() +
  facet_wrap(vars(SEG_BEHAVIOUR))
Gamma Dist - HDI / Quantile / Alteryx Limits for 2σ
C:\Users\julian.hillierfry\Google Drive\R\HDI\fun v6.RMD
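No code was saved with this note; a minimal sketch of the idea, using hypothetical Gamma parameters (the real fit lives in the RMD above):
library(HDInterval)
shape <- 2.5; rate <- 0.1   # placeholder shape/rate, not the fitted values
cred <- 0.9545              # 2-sigma coverage for a normal distribution
# Equal-tailed quantile limits
qgamma(c((1 - cred) / 2, 1 - (1 - cred) / 2), shape = shape, rate = rate)
# Highest-density interval (narrower than the quantile limits for a skewed Gamma)
hdi(qgamma, credMass = cred, shape = shape, rate = rate)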
Part 1 Bayes package BayesVarSel
> library(BayesVarSel)
Loading required package: MASS
Loading required package: mvtnorm
Loading required package: parallel
> demo(BayesVarSel.Hald)
demo(BayesVarSel.Hald)
---- ~~~~~~~~~~~~~~~~
Type <Return> to start :
> #read Hald data
> data(Hald)
> #run the main function: (in this small example we keep all models)
> hald.Bvs<- Bvs(formula="y~x1+x2+x3+x4", data=Hald, n.keep=16)
Info. . . .
Most complex model has 5 covariates
From those 1 is fixed and we should select from the remaining 4
x1, x2, x3, x4
The problem has a total of 16 competing models
Of these, the 16 most probable (a posteriori) are kept
Working on the problem...please wait.
> #print the result
> hald.Bvs
Call:
Bvs(formula = "y~x1+x2+x3+x4", data = Hald, n.keep = 16)
The 10 most probable models and their probabilities are:
x1 x2 x3 x4 prob
1 * * 4.743649e-01
2 * * 1.516526e-01
3 * * * 1.115362e-01
4 * * * 1.102744e-01
5 * * * 8.864006e-02
6 * * * * 3.970713e-02
7 * * * 2.028513e-02
8 * * 3.421526e-03
9 * * 7.799539e-05
10 * 1.748812e-05
(The remanining 6 models are kept but omitted in this print)
> #obtain a summary of the result
> summary(hald.Bvs)
Call:
Bvs(formula = "y~x1+x2+x3+x4", data = Hald, n.keep = 16)
Inclusion Probabilities:
Incl.prob. HPM MPM
x1 0.9762 * *
x2 0.7563 * *
x3 0.2624
x4 0.4153
---
Code: HPM stands for Highest posterior Probability Model and
MPM for Median Probability Model.
> #plots for the result
> #image plot of the conditional probabilities
> plot(hald.Bvs, option="conditional")
poly fitting with prior linear and training sample
library(tidyverse)
library(caret)
library(MASS)
theme_set(theme_classic())
# Preparing the data
# Load the data
data("Boston", package = "MASS")
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
# First, visualize the scatter plot of the medv vs lstat variables as follows:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth()
# Linear regression
# Build the model
model <- lm(medv ~ lstat, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$medv),
  R2 = R2(predictions, test.data$medv)
)
# Visualize the data:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)
# Polynomial regression
# In R, to create a predictor x^2 you should use the function I(), as follows: I(x^2). This raises x to the power of 2.
# The polynomial regression can be computed in R as follow:
lm(medv ~ lstat + I(lstat^2), data = train.data)
# An alternative simple solution is to use this:
lm(medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
# The output contains two coefficients associated with lstat: one for the linear term (lstat^1) and one for the quadratic term (lstat^2).
# The following example computes a sixth-order polynomial fit:
lm(medv ~ poly(lstat, 6, raw = TRUE), data = train.data) %>%
  summary()
# From the output above, it can be seen that polynomial terms beyond the fifth order are not significant. So, just create a fifth-order polynomial regression model as follows:
# Build the model
model <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$medv),
  R2 = R2(predictions, test.data$medv)
)
# Visualize the fifth-order polynomial regression line as follows:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))
poly fit template
p <- 0.5
q <- seq(0,100,1)
y <- p*q
plot(q,y,type='l',col='red',main='Linear relationship')
y <- 450 + p*(q-10)^3
plot(q,y,type='l',col='navy',main='Nonlinear relationship',lwd=3)
set.seed(20)
q <- seq(from=0, to=20, by=0.1)
# Value to predict (y):
y <- 500 + 0.4 * (q-10)^3
# Some noise is generated and added to the real signal (y):
noise <- rnorm(length(q), mean=10, sd=80)
noisy.y <- y + noise
#WORKING
plot(q,noisy.y,col='deepskyblue4',xlab='q',main='Observed data')
lines(q,y,col='firebrick1',lwd=3)
# Let's fit it using R. When fitting polynomials you can use poly():
model <- lm(noisy.y ~ poly(q, 3))
# OR raw terms via I(); the first option avoids correlated predictors by producing orthogonal polynomials
model <- lm(noisy.y ~ q + I(q^2) + I(q^3))
#plot(fitted(model),residuals(model))
# Predicted values and confidence intervals:
predicted.intervals <- predict(model, data.frame(q = q), interval = 'confidence',
                               level = 0.99)
# Add lines to the existing plot:
lines(q,predicted.intervals[,1],col='green',lwd=3)
lines(q,predicted.intervals[,2],col='red',lwd=1)
lines(q,predicted.intervals[,3],col='black',lwd=1)
# Legend
legend("bottomright",c("Observ.","Signal","Predicted"),
col=c("deepskyblue4","red","green"), lwd=3)
Box plot
IV by group
e <- data.frame(group = OHLC_DAILY_P_IV_SPX$IVcat1, x=OHLC_DAILY_P_IV_SPX$SPX.Open.y)
ggplot(e) + geom_boxplot(aes(factor(group), x))
Trigger exp v0
origin <- OHLC_DAILY_P_IV_SPX$H_E_h
version1 <- OHLC_DAILY_P_IV_SPX$H_E_h_v1.1
# Generate normal distributions with the same mean and sd
origin_norm <- rnorm(200, mean = mean(origin, na.rm = TRUE), sd = sd(origin, na.rm = TRUE))
version1_norm <- rnorm(200, mean = mean(version1, na.rm = TRUE), sd = sd(version1, na.rm = TRUE))
boxplot(origin, origin_norm, version1, version1_norm,
        main = "Multiple boxplots for comparison",
        at = c(1, 2, 4, 5),
        names = c("origin", "normal", "version1", "normal"),
        las = 2,
        col = c("orange", "red"),
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)
Exp H to R_co
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = E_h_co) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
time series HEH vs HEH_v1
# Quantify points which are in error
# Template: replace test_expression, x, and y with the error condition and the values to keep
OHLC_DAILY_P_IV_SPX$Er.H.Amt <- ifelse(test_expression, x, y)
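# A concrete filled-in version (assumed definition of "error": the estimate
# E_h overshooting the realised high; adjust to taste):
OHLC_DAILY_P_IV_SPX$Er.H.Amt <- ifelse(
  OHLC_DAILY_P_IV_SPX$E_h > OHLC_DAILY_P_IV_SPX$SPX.High.x,
  OHLC_DAILY_P_IV_SPX$E_h - OHLC_DAILY_P_IV_SPX$SPX.High.x,
  0)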
# testing EH_v2 amplification in coeff
OHLC_DAILY_P_IV_SPX$H_E_h_v2 <- OHLC_DAILY_P_IV_SPX$E_h_v2 - OHLC_DAILY_P_IV_SPX$SPX.High.x
# comparing results on same graph
ggplot(OHLC_DAILY_P_IV_SPX, aes(Date)) +
  # map colour to a series label, not to the numeric values themselves
  geom_line(aes(y = H_E_h, colour = "H_E_h")) +
  geom_line(aes(y = H_E_h_v2, colour = "H_E_h_v2"))
Time series How often is the EHE < R_co
OHLC_DAILY_P_IV_SPX$Er.EHE.R_co <- (OHLC_DAILY_P_IV_SPX$E_h - OHLC_DAILY_P_IV_SPX$SPX.Open.x) - OHLC_DAILY_P_IV_SPX$R_co
library(ggplot2)
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = Er.EHE.R_co) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
summary(OHLC_DAILY_P_IV_SPX$Er.EHE.R_co)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-58.882 6.535 13.276 15.797 22.008 131.778
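# Direct answer to the heading's question - the share of days where EHE < R_co
# (i.e. the error series is negative); a one-line addition:
mean(OHLC_DAILY_P_IV_SPX$Er.EHE.R_co < 0, na.rm = TRUE)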
HEH / H_o % ratio
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = Er.H_E_H, color = SPX.Open.y) +
  geom_line() +
  scale_colour_viridis_c(option = "viridis") +
  labs(title = "Percentage of error to high ratio",
       y = "%") +
  theme_light()
Time series EHE with EH + 15% in coeff
# Adds 0.0015 (15 bps) to the IV coefficient
OHLC_DAILY_P_IV_SPX$H_E_h_1 <- (OHLC_DAILY_P_IV_SPX$SPX.Open.x * (1 + OHLC_DAILY_P_IV_SPX$IV_d + 0.0015)) - OHLC_DAILY_P_IV_SPX$SPX.High.x
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h_1) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
Time series EHE + 30
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h + 30) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
Time series of EHE
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
polynomial curve with fitting function
library("mvtnorm")
# polynomial regression example
# We create 2 vectors x and y
x <- runif(300, min=-10, max=10)
y <- 0.1*x^3 - 0.5 * x^2 - x + 10 + rnorm(length(x),0,8)
# plot of x and y :
plot(x,y,col=rgb(0.4,0.4,0.8,0.6),pch=16 , cex=1.3)
# Can we find a polynomial that fits this function?
model2 <- lm(y ~ x + I(x^2) + I(x^3))
# Features of this model:
summary(model2)
model2$coefficients
summary(model2)$adj.r.squared
# For each value of x, get the value of y estimated by the model and add it to the current plot
myPredict <- predict( model2 )
ix <- sort(x,index.return=T)$ix
lines(x[ix], myPredict[ix], col=2, lwd=2 )
# Add the features of the model to the plot (note: the fitted object is model2, not model)
coeff <- round(model2$coefficients, 2)
text(3, -70, paste("Model : ", coeff[1], " + ", coeff[2], "*x", "+", coeff[3], "*x^2", "+", coeff[4], "*x^3", "\n\n", "Adjusted R-squared = ", round(summary(model2)$adj.r.squared, 2)))
Plot
#graphical model
ggplot(OHLC_DAILY_P_IV_SPX, aes(SPX.Open.y, H_E_h)) +
  geom_point(col = "red", size = 0.1) +
  stat_smooth(method = lm) +
  geom_abline(intercept = 5.7551, slope = 8.1988)
# geom_abline(lsfit(OHLC_DAILY_P_IV_SPX$SPX.Open.y,OHLC_DAILY_P_IV_SPX$H_E_h))
Error proportional to IV increase
#graphical model
ggplot(OHLC_DAILY_P_IV_SPX, aes(SPX.Open.y, H_E_h)) +
  geom_point(col = "red", size = 0.1) +
  stat_smooth(method = lm) +
  geom_abline(intercept = 4.95, slope = 9)
# geom_abline(lsfit(OHLC_DAILY_P_IV_SPX$SPX.Open.y,OHLC_DAILY_P_IV_SPX$H_E_h))