
Julian (julian_antonio)

Recently Published

hist:t-c piped
library(plotly)  # plot_ly() needs the plotly package loaded
p <- plot_ly(x = (t - c), type = "histogram") %>% print()
strike~trigger resid
t-c
hist(t - c)
library(plotly)
p <- plot_ly(x = (t - c), type = "histogram")
p
boxplot()
library(TTR)
library(dplyr)
set.seed(1234)  # set.seed() is a function call, not an assignment
x <- data.frame(value = runif(20, 10, 90))
x$run_max <- runMax(x$value, n = 5)  # rolling 5-period maximum
x

# Cumulative statistics by group: use mutate() inside the pipe rather than
# calling cumsum()/cummax() on the whole grouped data frame
tb1 %>% group_by(Timeframe.x) %>% mutate(cum_close = cumsum(SPX.Close.x))
tb1 %>% group_by(Timeframe.x) %>% mutate(SoFarMax = cummax(SPX.Open.x))

a <- cummax(tb1$SPX.Open.y)
a
summary(a)

# Group by year: group on the factor itself; as.numeric() on a factor
# returns level codes, which is why grouping "as factors" was not working
tb1$year <- factor(tb1$year)
tb1 %>%
  group_by(year) %>%
  mutate(SoFarMax = cummax(Timeframe.x))

# Define the cars vector with 5 values and graph it
cars <- c(1, 3, 6, 4, 9)
barplot(cars)
barplot(a)
template by years
ggplot() + geom_line(data = tb1, aes(x = Timeframe.x, y = SPX.Open.x, color = year))
template
# plot the data using ggplot2 and pipes
tb %>%
  ggplot(aes(x = `Date & Time`, y = SPX.Open.x)) +
  geom_point(color = "darkorchid4") +
  labs(title = "Precipitation - Boulder, Colorado",
       subtitle = "The data frame is sent to the plot using pipes",
       y = "Daily precipitation (inches)",
       x = "Date") +
  theme_bw(base_size = 15)
ROCR tb2 vip seg
library(ROCR)
ROCRpred <- prediction(predict, data_test$top)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
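A natural companion to the ROC curve is the area under it; a minimal sketch using ROCR's own "auc" measure on the ROCRpred object built above:

auc <- performance(ROCRpred, measure = "auc")
auc@y.values[[1]]  # scalar AUC for the fitted classifier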
cor_all_vars
library(GGally)
# Convert data to numeric
corr <- data.frame(lapply(test, as.integer))
# Plot the graph
ggcorr(corr,
       method = c("pairwise", "spearman"),
       nbreaks = 6,
       hjust = 0.8,
       label = TRUE,
       label_size = 3,
       color = "grey50")
NGR
# box plot of NGR by top segment
ggplot(test, aes(x = factor(top), y = NGR)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", size = 3, color = "steelblue") +  # fun.y is deprecated in ggplot2 >= 3.3
  theme_classic()
example - logistic 3
formula <- income ~ .
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
lapply(logit, class)[1:3]
logit$aic
predict <- predict(logit, data_test, type = 'response')

# confusion matrix
table_mat <- table(data_test$income, predict > 0.5)
table_mat
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test

# Precision vs Recall
# Precision looks at the accuracy of the positive predictions. Recall is the
# ratio of positive instances that are correctly detected by the classifier.
precision <- function(matrix) {
  tp <- matrix[2, 2]  # true positives
  fp <- matrix[1, 2]  # false positives
  return(tp / (tp + fp))
}
recall <- function(matrix) {
  tp <- matrix[2, 2]  # true positives
  fn <- matrix[2, 1]  # false negatives (mislabeled as false positives in the original)
  return(tp / (tp + fn))
}
prec <- precision(table_mat)
prec
rec <- recall(table_mat)
rec

library(ROCR)
ROCRpred <- prediction(predict, data_test$income)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
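A quick worked check of the two helper functions on a made-up confusion matrix (the counts are illustrative, not from the model above): with 100 true positives, 20 false positives and 30 false negatives, precision = 100 / 120 ≈ 0.83 and recall = 100 / 130 ≈ 0.77.

m <- matrix(c(850, 30, 20, 100), nrow = 2)  # rows: actual, cols: predicted
precision(m)  # 100 / (100 + 20) = 0.833
recall(m)     # 100 / (100 + 30) = 0.769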
example - logistic correlation GGally package
library(GGally)
# Convert data to numeric
corr <- data.frame(lapply(recast_data, as.integer))
# Plot the graph
ggcorr(corr,
       method = c("pairwise", "spearman"),
       nbreaks = 6,
       hjust = 0.8,
       label = TRUE,
       label_size = 3,
       color = "grey50")
example - logistic
# https://www.guru99.com/r-generalized-linear-model.html
library(dplyr)
data_adult <- read.csv("https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/data_adult.csv")
glimpse(data_adult)

continuous <- select_if(data_adult, is.numeric)
summary(continuous)

# Histogram with kernel density curve
library(ggplot2)
ggplot(continuous, aes(x = hours.per.week)) +
  geom_density(alpha = .2, fill = "#FF6666")

top_one_percent <- quantile(data_adult$hours.per.week, .99)
top_one_percent
data_adult_drop <- data_adult %>% filter(hours.per.week < top_one_percent)
dim(data_adult_drop)

data_adult_rescale <- data_adult_drop %>%
  mutate_if(is.numeric, ~ as.numeric(scale(.)))  # funs() is deprecated
head(data_adult_rescale)

# Select the categorical columns
factor <- data.frame(select_if(data_adult_rescale, is.factor))
ncol(factor)

# Create a bar graph for each categorical column
graph <- lapply(names(factor),
                function(x)
                  ggplot(factor, aes(get(x))) +
                  geom_bar() +
                  theme(axis.text.x = element_text(angle = 90)))
graph

# Collapse education into six broader levels
recast_data <- data_adult_rescale %>%
  select(-X) %>%
  mutate(education = factor(
    ifelse(education %in% c("Preschool", "10th", "11th", "12th",
                            "1st-4th", "5th-6th", "7th-8th", "9th"), "dropout",
    ifelse(education == "HS-grad", "HighGrad",
    ifelse(education %in% c("Some-college", "Assoc-acdm", "Assoc-voc"), "Community",
    ifelse(education == "Bachelors", "Bachelors",
    ifelse(education %in% c("Masters", "Prof-school"), "Master", "PhD")))))))

recast_data %>%
  group_by(education) %>%
  summarize(average_educ_year = mean(educational.num), count = n()) %>%
  arrange(average_educ_year)

# Collapse marital status levels
recast_data <- recast_data %>%
  mutate(marital.status = factor(
    ifelse(marital.status %in% c("Never-married", "Married-spouse-absent"), "Not_married",
    ifelse(marital.status %in% c("Married-AF-spouse", "Married-civ-spouse"), "Married",
    ifelse(marital.status %in% c("Separated", "Divorced"), "Separated", "Widow")))))
table(recast_data$marital.status)

# Plot gender vs income
ggplot(recast_data, aes(x = gender, fill = income)) +
  geom_bar(position = "fill") +
  theme_classic()

# Plot race vs income
ggplot(recast_data, aes(x = race, fill = income)) +
  geom_bar(position = "fill") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

# Box plot of working time by gender
ggplot(recast_data, aes(x = gender, y = hours.per.week)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", size = 3, color = "steelblue") +
  theme_classic()

# Distribution of working time by education
ggplot(recast_data, aes(x = hours.per.week)) +
  geom_density(aes(color = education), alpha = 0.5) +
  theme_classic()
anova <- aov(hours.per.week ~ education, recast_data)
summary(anova)

ggplot(recast_data, aes(x = age, y = hours.per.week)) +
  geom_point(aes(color = income), size = 0.5) +
  stat_smooth(method = 'lm', formula = y ~ poly(x, 2), se = TRUE, aes(color = income)) +
  theme_classic()
chaid tree v1
# https://rstudio-pubs-static.s3.amazonaws.com/219097_49ca9b3b3a724f3c9bfed56551c37bc4.html
library(CHAID)  # chaid() and chaid_control() come from the CHAID package (R-Forge)
set.seed(290875)
tb2cF_sampleS <- tb2cF[sample(1:nrow(tb2cF), 125000), ]  # sample drawn but unused: the fit below runs on the full data
ctrl <- chaid_control(minsplit = 20, minbucket = 5, minprob = 0)
chaid <- chaid(top ~ ., data = tb2cF, control = ctrl)
print(chaid)
plot(chaid, type = "simple")
Plot
# Same CHAID fit as in "chaid tree v1" above; this entry differs only in the
# plot call (full plot instead of type = "simple")
plot(chaid)
ctree
tb2g library("party") # The first thing that we will do is remove all of the records with missing ozone data mod18 <- ctree(top ~ ., data = tb2g, controls = ctree_control(maxsurrogate = 3)) # Next we use ctree to construct a model of ozone as a function of all other covariates mod18.ct <- ctree(top ~ ., data = tb2g, controls = ctree_control(maxsurrogate = 3)) mod18.ct plot(mod18.ct)
tree example
library(ISLR)
library(tree)
data(package = "ISLR")
carseats <- Carseats
names(carseats)
hist(carseats$Sales)
# High must be a factor for tree() to build a classification tree
High <- as.factor(ifelse(carseats$Sales <= 8, "No", "Yes"))
carseats <- data.frame(carseats, High)
tree.carseats <- tree(High ~ . - Sales, data = carseats)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats, pretty = 0)
tree #1
# decision tree ####
install.packages("rpart")
library(rpart)
# grow tree
fit <- rpart(top ~ ., method = "class", data = tb2f)
printcp(fit)  # display the results
plotcp(fit)   # cross-validation results
summary(fit)  # detailed summary of splits
# plot tree
plot(fit, uniform = TRUE,
     main = "Classification Tree for Top ~ NUTS2_NAME + ACORN_TYPE_DESC")
text(fit, use.n = TRUE, all = TRUE, cex = .8)
scaled numeric
source("http://www.sthda.com/upload/rquery_cormat.r") rquery.cormat(tb2c, type="full")
plotly box plot
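This entry was published without code; as a placeholder, a minimal plotly box-plot sketch on made-up data (df, group and value are all illustrative names, not from the original):

library(plotly)
df <- data.frame(group = rep(c("A", "B"), each = 50),
                 value = c(rnorm(50, 0), rnorm(50, 1)))
# one box per group level
plot_ly(df, x = ~group, y = ~value, type = "box")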
NUTS~ACORN v1
install.packages("PerformanceAnalytics") library("PerformanceAnalytics") my_data <- data_v2_Trsf[, c(2,3,4,5)] chart.Correlation(my_data, histogram=TRUE, pch=19)
seg_behaviour frequency
library(ggplot2)  # assigning the result of library() to seg_behaviour was a bug
ggplot(data = data_v1) +
  aes(x = SEG_BEHAVIOUR) +
  geom_bar(fill = "#0c4c8a") +
  theme_minimal() +
  facet_wrap(vars(SEG_BEHAVIOUR))
Gamma Dist - HDI / Quantile / Alteryx Limits for 2σ
C:\Users\julian.hillierfry\Google Drive\R\HDI\fun v6.RMD
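The code itself lives in the local fun v6.RMD file above; as a stand-in, a minimal sketch of what the title describes, i.e. equal-tailed quantile limits versus the highest-density interval (HDI) of a gamma distribution at 2σ coverage (~95.45%). The shape/rate values are illustrative assumptions, not taken from the original file.

library(HDInterval)
cred  <- 0.9545          # 2-sigma coverage, by analogy with the normal
shape <- 2; rate <- 0.5  # assumed example parameters
# equal-tailed quantile limits
qgamma(c((1 - cred) / 2, 1 - (1 - cred) / 2), shape = shape, rate = rate)
# highest-density interval: shorter than the quantile interval for a skewed gamma
hdi(qgamma, credMass = cred, shape = shape, rate = rate)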
Part 1 Bayes package BayesVarSel
> library(BayesVarSel)
Loading required package: MASS
Loading required package: mvtnorm
Loading required package: parallel
> demo(BayesVarSel.Hald)

        demo(BayesVarSel.Hald)
        ---- ~~~~~~~~~~~~~~~~

Type  <Return>  to start :

> #read Hald data
> data(Hald)
> #run the main function: (in this small example we keep all models)
> hald.Bvs <- Bvs(formula = "y~x1+x2+x3+x4", data = Hald, n.keep = 16)
Info. . . .
Most complex model has 5 covariates
From those 1 is fixed and we should select from the remaining 4
x1, x2, x3, x4
The problem has a total of 16 competing models
Of these, the 16 most probable (a posteriori) are kept
Working on the problem...please wait.
> #print the result
> hald.Bvs

Call: Bvs(formula = "y~x1+x2+x3+x4", data = Hald, n.keep = 16)

The 10 most probable models and their probabilities are:

   x1 x2 x3 x4         prob
1   *  *        4.743649e-01
2   *        *  1.516526e-01
3   *  *     *  1.115362e-01
4   *  *  *     1.102744e-01
5   *     *  *  8.864006e-02
6   *  *  *  *  3.970713e-02
7      *  *  *  2.028513e-02
8         *  *  3.421526e-03
9      *  *     7.799539e-05
10           *  1.748812e-05

(The remaining 6 models are kept but omitted in this print)

> #obtain a summary of the result
> summary(hald.Bvs)

Call: Bvs(formula = "y~x1+x2+x3+x4", data = Hald, n.keep = 16)

Inclusion Probabilities:
   Incl.prob. HPM MPM
x1     0.9762   *   *
x2     0.7563   *   *
x3     0.2624
x4     0.4153
---
Code: HPM stands for Highest posterior Probability Model and
      MPM for Median Probability Model.

> #plots for the result
> #image plot of the conditional probabilities
> plot(hald.Bvs, option="conditional")
poly fitting with prior linear and training sample
library(tidyverse)
library(caret)
library(MASS)
theme_set(theme_classic())

# Preparing the data
data("Boston", package = "MASS")
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]

# First, visualize the scatter plot of the medv vs lstat variables:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth()

# Linear regression
model <- lm(medv ~ lstat, data = train.data)
predictions <- model %>% predict(test.data)
# Model performance
data.frame(RMSE = RMSE(predictions, test.data$medv),
           R2 = R2(predictions, test.data$medv))
# Visualize the fit:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)

# Polynomial regression
# In R, to create a predictor x^2 you should use the function I(), as in
# I(x^2); this raises x to the power 2.
lm(medv ~ lstat + I(lstat^2), data = train.data)
# An alternative simple solution:
lm(medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
# The output contains two coefficients associated with lstat: one for the
# linear term (lstat^1) and one for the quadratic term (lstat^2).

# The following computes a sixth-order polynomial fit:
lm(medv ~ poly(lstat, 6, raw = TRUE), data = train.data) %>% summary()
# Polynomial terms beyond the fifth order are not significant, so fit a
# fifth-order polynomial regression model:
model <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = train.data)
predictions <- model %>% predict(test.data)
data.frame(RMSE = RMSE(predictions, test.data$medv),
           R2 = R2(predictions, test.data$medv))
# Visualize the fifth-order polynomial regression line:
ggplot(train.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))
poly fit template
p <- 0.5
q <- seq(0, 100, 1)
y <- p * q
plot(q, y, type = 'l', col = 'red', main = 'Linear relationship')
y <- 450 + p * (q - 10)^3
plot(q, y, type = 'l', col = 'navy', main = 'Nonlinear relationship', lwd = 3)

set.seed(20)
q <- seq(from = 0, to = 20, by = 0.1)
# Value to predict (y):
y <- 500 + 0.4 * (q - 10)^3
# Some noise is generated and added to the real signal (y):
noise <- rnorm(length(q), mean = 10, sd = 80)
noisy.y <- y + noise
plot(q, noisy.y, col = 'deepskyblue4', xlab = 'q', main = 'Observed data')
lines(q, y, col = 'firebrick1', lwd = 3)

# Fit a cubic polynomial. You can either use poly() ...
model <- lm(noisy.y ~ poly(q, 3))
# ... or spell the terms out with I(); the first option avoids correlated
# terms by producing orthogonal polynomials. (The original wrote the raw
# terms with x and X instead of q, which was a bug.)
# model <- lm(noisy.y ~ q + I(q^2) + I(q^3))
# plot(fitted(model), residuals(model))

# Predicted values and confidence intervals:
predicted.intervals <- predict(model, data.frame(q = q),
                               interval = 'confidence', level = 0.99)
# Add lines to the existing plot:
lines(q, predicted.intervals[, 1], col = 'green', lwd = 3)
lines(q, predicted.intervals[, 2], col = 'red', lwd = 1)
lines(q, predicted.intervals[, 3], col = 'black', lwd = 1)
# Legend
legend("bottomright", c("Observ.", "Signal", "Predicted"),
       col = c("deepskyblue4", "red", "green"), lwd = 3)
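The point about orthogonal polynomials is easy to verify; a small sketch (my own addition) showing that raw powers of q are nearly collinear while the columns returned by poly() are orthogonal:

q <- seq(0, 20, by = 0.1)
cor(q, q^2)                 # close to 1: raw polynomial terms are correlated
round(cor(poly(q, 3)), 10)  # identity matrix: orthogonal columns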
Box plot
# IV by group
e <- data.frame(group = OHLC_DAILY_P_IV_SPX$IVcat1,
                x = OHLC_DAILY_P_IV_SPX$SPX.Open.y)
ggplot(e) + geom_boxplot(aes(factor(group), x))
Trigger exp v0
origin <- OHLC_DAILY_P_IV_SPX$H_E_h
version1 <- OHLC_DAILY_P_IV_SPX$H_E_h_v1.1
# generate normal distributions with the same mean and sd
origin_norm <- rnorm(200, mean = mean(origin, na.rm = TRUE),
                     sd = sd(origin, na.rm = TRUE))
version1_norm <- rnorm(200, mean = mean(version1, na.rm = TRUE),
                       sd = sd(version1, na.rm = TRUE))
boxplot(origin, origin_norm, version1, version1_norm,
        main = "Multiple boxplots for comparison",
        at = c(1, 2, 4, 5),
        names = c("origin", "normal", "version1", "normal"),
        las = 2,
        col = c("orange", "red"),
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)
Exp H to R_co
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = E_h_co) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
time series HEH vs HEH_v1
# Quantify points which are in error (template: test_expression, x and y are
# placeholders still to be filled in)
OHLC_DAILY_P_IV_SPX$Er.H.Amt <- ifelse(test_expression, x, y)

# testing E_h_v2 amplification in coeff
OHLC_DAILY_P_IV_SPX$H_E_h_v2 <- OHLC_DAILY_P_IV_SPX$E_h_v2 - OHLC_DAILY_P_IV_SPX$SPX.High.x

# comparing results on the same graph; map bare column names inside aes()
# rather than repeating the data-frame name, and give each series a label
ggplot(OHLC_DAILY_P_IV_SPX, aes(Date)) +
  geom_line(aes(y = H_E_h, colour = "H_E_h")) +
  geom_line(aes(y = H_E_h_v2, colour = "H_E_h_v2"))
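For the ifelse() template at the top, a hypothetical way it could be filled in (the error definition below is my assumption, not from the original): flag a point as in error when the expected high E_h falls short of the realised high, and record the size of the shortfall.

OHLC_DAILY_P_IV_SPX$Er.H.Amt <- ifelse(
  OHLC_DAILY_P_IV_SPX$E_h < OHLC_DAILY_P_IV_SPX$SPX.High.x,  # assumed error condition
  OHLC_DAILY_P_IV_SPX$SPX.High.x - OHLC_DAILY_P_IV_SPX$E_h,  # size of the shortfall
  0)                                                         # no error otherwise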
Time series How often is the EHE < R_co
OHLC_DAILY_P_IV_SPX$Er.EHE.R_co <- (OHLC_DAILY_P_IV_SPX$E_h - OHLC_DAILY_P_IV_SPX$SPX.Open.x) -
  OHLC_DAILY_P_IV_SPX$R_co
library(ggplot2)
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = Er.EHE.R_co) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
summary(OHLC_DAILY_P_IV_SPX$Er.EHE.R_co)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# -58.882   6.535  13.276  15.797  22.008 131.778
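The entry's title asks how often EHE is below R_co, which the summary only hints at (the first quartile is positive, so it happens on fewer than a quarter of days); a direct one-liner for the proportion:

# share of days where (E_h - open) fell below R_co, i.e. Er.EHE.R_co < 0
mean(OHLC_DAILY_P_IV_SPX$Er.EHE.R_co < 0, na.rm = TRUE)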
HEH / H_o % ratio
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = Er.H_E_H, color = SPX.Open.y) +
  geom_line() +
  scale_colour_viridis_c(option = "viridis") +
  labs(title = "Percentage of error to high ratio", y = "%") +
  theme_light()
Time series EHE with EH + 15% in coeff
OHLC_DAILY_P_IV_SPX$H_E_h_1 <- (OHLC_DAILY_P_IV_SPX$SPX.Open.x *
  (1 + OHLC_DAILY_P_IV_SPX$IV_d + 0.0015)) - OHLC_DAILY_P_IV_SPX$SPX.High.x
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h_1) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
Time series EHE + 30
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h + 30) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
Time series of EHE
ggplot(data = OHLC_DAILY_P_IV_SPX) +
  aes(x = Date, y = H_E_h) +
  geom_line(color = "#0c4c8a") +
  theme_minimal()
polynomial curve with fitting function
library("mvtnorm") # polynomial regression exammple # We create 2 vectors x and y x <- runif(300,  min=-10, max=10) y <- 0.1*x^3 - 0.5 * x^2 - x + 10 + rnorm(length(x),0,8) # plot of x and y : plot(x,y,col=rgb(0.4,0.4,0.8,0.6),pch=16 , cex=1.3) # Can we find a polynome that fit this function ? model2=lm(y ~ x + I(x^2) + I(x^3)) #I can get the features of this model : summary(model2) model2$coefficients summary(model2)$adj.r.squared #For each value of x, I can get the value of y estimated by the model, and add it to the current plot ! myPredict <- predict( model2 ) ix <- sort(x,index.return=T)$ix lines(x[ix], myPredict[ix], col=2, lwd=2 )   # I add the features of the model to the plot coeff=round(model$coefficients , 2) text(3,-70 , paste("Model : ",coeff[1] , " + " , coeff[2] , "*x"  , "+" , coeff[3] , "*x^2" , "+" , coeff[4] , "*x^3" , "\n\n" , "P-value adjusted = ",round(summary(model)$adj.r.squared,2)))
Plot
# graphical model
ggplot(OHLC_DAILY_P_IV_SPX, aes(SPX.Open.y, H_E_h)) +
  geom_point(col = "red", size = 0.1) +
  stat_smooth(method = lm) +
  geom_abline(intercept = 5.7551, slope = 8.1988)
# geom_abline(lsfit(OHLC_DAILY_P_IV_SPX$SPX.Open.y, OHLC_DAILY_P_IV_SPX$H_E_h))
Error proportional to IV increase
# graphical model
ggplot(OHLC_DAILY_P_IV_SPX, aes(SPX.Open.y, H_E_h)) +
  geom_point(col = "red", size = 0.1) +
  stat_smooth(method = lm) +
  geom_abline(intercept = 4.95, slope = 9)
# geom_abline(lsfit(OHLC_DAILY_P_IV_SPX$SPX.Open.y, OHLC_DAILY_P_IV_SPX$H_E_h))
Plot
Plot
Plot
iv_diff_exp_realised
Plot
Research for Implavo
Document
HTML Made columns Range H-L
Plot