Commit 6d376d34 authored by Stanley Clark's avatar Stanley Clark
Browse files

More analysis

parent c4caac1e
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -11,7 +11,8 @@ pacman::p_load(
ggallin,
formattable,
grid,
gtable
gtable,
tidyverse
)
options(tz = "Europe/Amsterdam")
......@@ -19,6 +20,15 @@ source("stats.R")
source("plots.R")
# Load clean results
results <- rbind(
get_results(1, "results1-wrong.csv"),
get_results(5, "results5-wrong.csv"),
get_results(10, "results10-wrong.csv")
)
results <- rbind(
get_results(1, "results1-new.csv"),
get_results(2, "results2-new.csv")
)
results <- rbind(
get_results(1, "results1.csv"),
get_results(2, "results2.csv")
......@@ -26,10 +36,16 @@ results <- rbind(
stats <- get_stats(results)
# Print plots to the screen
diff_plan_time()
box_db_query_absolute_log()
box_db_query_relative()
stacked_means()
diff_plan <- diff_plan_time()
stacked <- stacked_means()
abs1 <- box_db_query_absolute_log(1)
rel1 <- box_db_query_relative(1)
abs2 <- box_db_query_absolute_log(2)
rel2 <- box_db_query_relative(2)
grid.arrange(abs1, abs2, rel1, rel2, diff_plan, stacked, nrow = 3, ncol = 2)
# Save all graphs to individual tex files
save_half("planning_time_joins-2.tex", diff_plan_time)
......@@ -38,6 +54,6 @@ save_half("rel_diff_execution_time-2.tex", box_db_query_relative)
save_half("stacked_means-2.tex", stacked_means)
save_third("planning_time_joins-3.tex", diff_plan_time)
save_third("diff_execution_time_log-3.tex", box_db_query_absolute_log)
save_third("rel_diff_execution_time-3.tex", box_db_query_relative)
save_third("diff_execution_time_log-3.tex", function() { box_db_query_absolute_log(1) })
save_third("rel_diff_execution_time-3.tex", function() { box_db_query_relative(1) })
save_to_tex("stacked_means-3.tex", stacked_means, 3.3, 2.2)
% Created by tikzDevice version 0.12.3.1 on 2020-09-04 15:28:01
% Created by tikzDevice version 0.12.3.1 on 2020-09-08 11:21:57
% !TEX encoding = UTF-8 Unicode
\begin{tikzpicture}[x=1pt,y=1pt]
\definecolor{fillColor}{RGB}{255,255,255}
......@@ -17,29 +17,32 @@
\path[fill=fillColor] ( 33.06, 25.92) rectangle (153.49,119.45);
\definecolor{drawColor}{gray}{0.92}
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 38.31) --
(153.49, 38.31);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 37.84) --
(153.49, 37.84);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 61.91) --
(153.49, 61.91);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 60.34) --
(153.49, 60.34);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 85.52) --
(153.49, 85.52);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06, 82.84) --
(153.49, 82.84);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06,109.13) --
(153.49,109.13);
\path[draw=drawColor,line width= 0.3pt,line join=round] ( 33.06,105.34) --
(153.49,105.34);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 26.50) --
(153.49, 26.50);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 26.59) --
(153.49, 26.59);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 50.11) --
(153.49, 50.11);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 49.09) --
(153.49, 49.09);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 73.72) --
(153.49, 73.72);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 71.59) --
(153.49, 71.59);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 97.33) --
(153.49, 97.33);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06, 94.09) --
(153.49, 94.09);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 33.06,116.59) --
(153.49,116.59);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 46.96, 25.92) --
( 46.96,119.45);
......@@ -57,16 +60,16 @@
(139.60,119.45);
\definecolor{drawColor}{RGB}{0,0,0}
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 46.96, 30.50) --
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 46.96, 30.58) --
( 70.12, 30.17) --
( 93.28, 43.77) --
(116.44, 69.71) --
(139.60,113.88);
\path[draw=drawColor,line width= 0.6pt,dash pattern=on 2pt off 2pt ,line join=round] ( 46.96, 30.68) --
( 70.12, 30.42) --
( 93.28, 44.54) --
(116.44, 70.77) --
( 93.28, 43.43) --
(116.44, 69.08) --
(139.60,113.71);
\path[draw=drawColor,line width= 0.6pt,dash pattern=on 2pt off 2pt ,line join=round] ( 46.96, 30.63) --
( 70.12, 30.30) --
( 93.28, 44.18) --
(116.44, 69.88) --
(139.60,115.20);
\definecolor{drawColor}{gray}{0.20}
......@@ -76,29 +79,34 @@
\path[clip] ( 0.00, 0.00) rectangle (158.99,158.99);
\definecolor{drawColor}{gray}{0.30}
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 24.30) {0};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 24.39) {0};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 46.89) {500};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 47.91) {500};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 69.39) {1000};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 71.51) {1000};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 91.89) {1500};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11, 95.12) {1500};
\node[text=drawColor,anchor=base east,inner sep=0pt, outer sep=0pt, scale= 0.64] at ( 28.11,114.39) {2000};
\end{scope}
\begin{scope}
\path[clip] ( 0.00, 0.00) rectangle (158.99,158.99);
\definecolor{drawColor}{gray}{0.20}
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 26.50) --
( 33.06, 26.50);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 26.59) --
( 33.06, 26.59);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 49.09) --
( 33.06, 49.09);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 50.11) --
( 33.06, 50.11);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 71.59) --
( 33.06, 71.59);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 73.72) --
( 33.06, 73.72);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 94.09) --
( 33.06, 94.09);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31, 97.33) --
( 33.06, 97.33);
\path[draw=drawColor,line width= 0.6pt,line join=round] ( 30.31,116.59) --
( 33.06,116.59);
\end{scope}
\begin{scope}
\path[clip] ( 0.00, 0.00) rectangle (158.99,158.99);
......
This diff is collapsed.
......@@ -11,40 +11,43 @@ latex_percent <- function (x) {
stringr::str_c(round(x * 100, 0), "\\%")
}
# 1. Difference between planning times per number of joins
# 1. Difference between planning times per number of joins over all scale factors
diff_plan_time <- function() {
df <- stats$meansuserid[stats$meansuserid$scale == 2 & stats$meansuserid$query %in% c(3, 96, 7, 26, 19, 25),]
df <- df[,c(3,6,10)]
df <- stats$meansuserid[stats$meansuserid$query %in% c(3, 96, 7, 26, 19, 25),]
df <- df[,c("scale", "case", "plan", "num_joins")]
df <- aggregate(plan ~ case + num_joins, data = df, mean)
ggplot(df, aes(x = num_joins, y = plan, group = case)) +
geom_line(aes(linetype = case)) +
ggplot_theme +
labs(y = "Absolute planning time (ms)", x = "Number of joins", linetype = "Variant") +
scale_linetype_discrete(labels = ggplot_case_labels)
scale_linetype_discrete(labels = ggplot_case_labels) +
ggtitle("Planning time per number of joins")
}
# 2. Box plot of distribution of mean DB execution times of different user IDs
# between two cases across queries
box_db_query_absolute_log <- function() {
ggplot(stats$means[stats$means$scale == 2,], aes(x = db, y = query, fill = case)) +
box_db_query_absolute_log <- function(scale) {
ggplot(stats$means[stats$means$scale == scale,], aes(x = db, y = query, fill = case)) +
geom_boxplot(outlier.size = 0.1) +
ggplot_theme +
scale_x_continuous(trans = pseudolog10_trans) +
geom_vline(xintercept = 0, linetype = "dotted") +
labs(y = "Query", x = "Execution time (ms)", fill = "Variant") +
scale_fill_discrete(labels = ggplot_case_labels) +
theme(axis.text.x = element_text(angle = 90))
theme(axis.text.x = element_text(angle = 90)) +
ggtitle(paste("Average Absolute DB Execution times per query with and without sec-RA rules (", toString(scale), "GB )"))
}
# 3. Box plot of relative increases in DB execution time
box_db_query_relative <- function() {
ggplot(stats$rel_diff[stats$rel_diff$scale == 2, ], aes(x = db, y = query)) +
box_db_query_relative <- function(scale) {
ggplot(stats$rel_diff[stats$rel_diff$scale == scale, ], aes(x = db, y = query)) +
geom_boxplot(outlier.size = 0.1) +
ggplot_theme +
geom_vline(xintercept = 0, linetype = "dotted") +
scale_x_continuous(labels = latex_percent) +
labs(y = "Query", x = "Relative improvement in execution time (\\%)") +
scale_fill_discrete(labels = ggplot_case_labels)
scale_fill_discrete(labels = ggplot_case_labels) +
ggtitle(paste("Average Relative Improvement in DB Execution time per query with and without sec-RA rules (", toString(scale), "GB )"))
}
# 4. Stacked plot of mean value of all components
......@@ -57,15 +60,15 @@ stacked_means <- function() {
ggplot_theme +
labs(x = "Variant", y = "Time (ms)", fill = "Component") +
scale_x_discrete(labels = ggplot_case_labels) +
scale_fill_discrete(labels = c("RA", "A-priori", "Planning", "DB")) +
theme(legend.position="right", axis.text.x = element_text(angle = -45))
scale_fill_discrete(labels = c("RA", "A-priori", "Expand", "Planning", "DB")) +
theme(legend.position="right", axis.text.x = element_text(angle = -45)) +
ggtitle("Break down of total time per component")
# Add general facet scale label
z <- ggplotGrob(p)
z <- gtable_add_rows(z, unit(2, "line"), 2)
z <- gtable_add_grob(z, textGrob("Scale Factor", gp=gpar(fontsize=8)), 2, 13, 6, 5)
grid.newpage()
grid.draw(z)
return(z)
}
......
......@@ -2,13 +2,37 @@ get_results <- function(scale, filename) {
df <- read.csv(file.path("input", filename))
df <- df[!(is.na(df$case)),] # Remove verification lines
df <- df[!(df$case == 3),]
df <- df[!df$trial %in% c(0, 1, 2, 3, 4),]
df$scale <- factor(scale) # Add scale column
df$plan_and_db <- df$plan + df$db
df$query <- as.factor(as.numeric(str_extract(df$query, "[0-9]+")))
if(!"expand" %in% colnames(df))
{
df$expand <- 1
}
df$plan_and_db <- df$plan + df$db
df$case <- as.factor(df$case)
df <- df[, c(11, 1, 5, 3, 2, 4, 6, 7, 8, 9, 10, 12)]
df <- df[, c("scale", "query", "user_id", "case", "trial", "num_results", "ra", "apriori", "plan", "expand", "string", "db", "plan_and_db")]
df <- df[order(df$scale, df$query, df$user_id, df$case, df$trial), ]
# Blank out the upper tail of a numeric vector.
#
# Despite the name, only the high side is trimmed: every element strictly
# greater than the `upper` quantile of `x` is replaced by NA. Elements at or
# below the cut-off are kept, and the length and order of `x` are unchanged,
# so the result can be bound back onto the rows it was computed from.
#
# Arguments:
#   x      numeric vector to filter.
#   na.rm  passed to quantile(); with the default TRUE, NAs already present
#          in `x` are ignored when the cut-off is computed.
#   ...    further arguments forwarded to quantile() (e.g. `type`).
#   upper  probability of the cut-off quantile; must be passed by name.
#          Defaults to 0.6, the value that was previously hard-coded.
#
# Returns a vector like `x` with every value above the cut-off set to NA.
remove_outliers <- function(x, na.rm = TRUE, ..., upper = 0.6) {
  stopifnot(is.numeric(upper), length(upper) == 1L)
  # Only the upper quantile is needed; the 25% quantile that used to be
  # requested alongside it was never read.
  cutoff <- unname(quantile(x, probs = upper, na.rm = na.rm, ...))
  # NAs in the comparison select nothing, so existing NAs stay as they are.
  x[x > cutoff] <- NA
  x
}
# Prepend a column named `new` to `df`, holding the `db` column after it has
# been passed through remove_outliers(). The original columns of `df` follow
# unchanged.
add_new_column <- function(df) {
  db_filtered <- remove_outliers(df$db)
  cbind(new = db_filtered, df)
}
df_clean <- df %>%
group_by(scale, query, user_id, case) %>%
nest() %>%
mutate(data = map(data, add_new_column)) %>%
unnest(cols = c(data))
df <- df_clean[!is.na(df_clean$new),]
df <- select (df,-c(new))
return(df)
}
......@@ -27,44 +51,44 @@ get_stats <- function(results) {
means <- means[ordering, ]
}
get_difference <- function(diff_func) {
rel_diff <- ddply(results, .(scale, query, user_id, trial), numcolwise(diff_func))
mean_diff <- ddply(rel_diff, .(scale, query, user_id), numcolwise(mean))
mean_diff <- mean_diff[, c(1, 2, 3, 6, 7, 8, 9, 10, 11)]
return(mean_diff)
get_difference <- function(means, diff_func) {
rel_mean_diff <- ddply(means, .(scale, query, user_id), numcolwise(diff_func))
rel_mean_diff <- rel_mean_diff[, c("scale", "query", "user_id", "ra", "apriori", "plan", "expand", "string", "db")]
return(rel_mean_diff)
}
get_absolute_difference <- function() {
return(get_difference(diff))
get_absolute_difference <- function(means) {
return(get_difference(means, diff))
}
get_relative_difference <- function() {
return(get_difference(function(x) { return((x[1] - x[2]) / x[1]) }))
get_relative_difference <- function(means) {
return(get_difference(means, function(x) { return((x[1] - x[2]) / x[1]) }))
}
# ------ Calculate Mean Time stats ------
get_means <- function() {
means <- ddply(results, .(scale, query, user_id, case), numcolwise(mean))
means <- means[, c(1, 2, 3, 4, 7, 8, 9, 10, 11, 12)]
means <- means[, c("scale", "query", "user_id", "case", "ra", "apriori", "plan", "expand", "string", "db")]
means <- means[order(means$scale, means$query, means$user_id, means$case), ]
return(means)
}
get_means_user_id <- function(means) {
meansuserid <- ddply(means, .(scale, query, case), numcolwise(mean))
meansuserid <- meansuserid[, c(1, 2, 3, 5, 6, 7, 8, 9, 10)]
meansuserid <- meansuserid[, c("scale", "query", "case", "ra", "apriori", "plan", "expand", "string", "db")]
return(meansuserid)
}
get_means_user_id_query <- function(meansuserid) {
meansuseridquery <- ddply(meansuserid, .(scale, case), numcolwise(mean))
meansuseridquery <- meansuseridquery[, c(1, 2, 3, 4, 5, 7)]
meansuseridquery <- meansuseridquery[, c("scale", "case", "ra", "apriori", "plan", "expand", "db")]
meansuseridquery <- melt(meansuseridquery,id.vars = c("scale", "case"))
return(meansuseridquery)
}
abs_diff <- get_absolute_difference()
rel_diff <- get_relative_difference()
means <- get_means()
abs_diff <- get_absolute_difference(means)
rel_diff <- get_relative_difference(means)
meansuserid <- get_means_user_id(means)
meansuseridquery <- get_means_user_id_query(meansuserid)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment