Let’s simulate user visits from 4 different browser versions: 1 bad and 3 good. The users’ main languages are de/fr/en, and language makes no significant difference to the conversion rate. The version does, and we want to detect that using a pairwise Wilcoxon rank-sum test to identify pairs of versions with different conversion rates, and a chi-squared test to identify which variable is significant (it is version, not language).

library(ggplot2)
# Simulate `cnt` visits for one browser version / language combination.
#
# Args:
#   version:     Browser-version label stored in the `version` column.
#   lang:        Language code stored in the `lang` column.
#   cnt:         Number of visits (rows) to simulate; 0 yields an empty frame.
#   success_rte: Conversion probability used for each visit's Bernoulli draw.
#
# Returns:
#   A data.frame with columns `version`, `lang` and `success` (0 or 1), one
#   row per simulated visit.
generateVisitors=function(version, lang,cnt, success_rte) {
  data.frame(
    # Recycle the labels to `cnt` rows explicitly: data.frame() refuses to
    # combine a length-1 label with a zero-length `success` column, so
    # cnt = 0 used to raise "arguments imply differing number of rows".
    version = rep_len(version, cnt),
    lang = rep_len(lang, cnt),
    # rbinom() recycles `prob` on its own, so no rep() is needed here.
    success = rbinom(cnt, 1, success_rte)
  )
}
# Assemble one simulated data set covering all four browser versions.
#
# Args:
#   bad_version_visitors: Number of visits generated for "bad-version.2",
#                         which is simulated with a success rate of 0.
#   total:                Base visitor count; the three good versions receive
#                         45%, 34% and 11% of it (each rounded).
#
# Returns:
#   The per-version frames produced by generateVisitors(), stacked in the
#   order good-version.1, bad-version.2, good-version.3, good-version.4.
generateVisitorData=function(bad_version_visitors, total) {
  # list() evaluates its arguments left to right, so the random draws happen
  # in the same order as the stacked rows.
  segments = list(
    generateVisitors("good-version.1", "de", round(total * 0.45), 0.12),
    generateVisitors("bad-version.2", "en", bad_version_visitors, 0),
    generateVisitors("good-version.3", "en", round(total * 0.34), 0.15),
    generateVisitors("good-version.4", "fr", round(total * 0.11), 0.12)
  )
  do.call(rbind, segments)
}
# Simulate one data set (70 bad-version visits, base total of 870) and
# display 20 randomly chosen rows as a sanity check.
df = generateVisitorData(bad_version_visitors = 70, total = 870)
preview_rows = sample(seq_len(nrow(df)), size = 20)
df[preview_rows, ]
##            version lang success
## 17  good-version.1   de       0
## 508 good-version.3   en       0
## 657 good-version.3   en       0
## 357 good-version.1   de       0
## 529 good-version.3   en       0
## 744 good-version.3   en       1
## 633 good-version.3   en       0
## 439  bad-version.2   en       0
## 420  bad-version.2   en       0
## 32  good-version.1   de       0
## 455  bad-version.2   en       0
## 246 good-version.1   de       0
## 198 good-version.1   de       0
## 384 good-version.1   de       0
## 823 good-version.4   fr       0
## 403  bad-version.2   en       0
## 239 good-version.1   de       0
## 451  bad-version.2   en       0
## 335 good-version.1   de       1
## 178 good-version.1   de       0
# Observed conversion rate per language.
aggregate(success ~ lang, data = df, FUN = mean)
##   lang   success
## 1   de 0.1352041
## 2   en 0.1338798
## 3   fr 0.1458333
# Observed conversion rate per browser version.
aggregate(success ~ version, data = df, FUN = mean)
##          version   success
## 1 good-version.1 0.1352041
## 2  bad-version.2 0.0000000
## 3 good-version.3 0.1655405
## 4 good-version.4 0.1458333
# Compare conversion between every pair of versions; p-values are adjusted
# for multiple comparisons with Holm's method (the function's default).
pairwise.wilcox.test(x = df$success, g = df$version, p.adjust.method = "holm")
## 
##  Pairwise comparisons using Wilcoxon rank sum test 
## 
## data:  df$success and df$version 
## 
##                good-version.1 bad-version.2 good-version.3
## bad-version.2  0.0044         -             -             
## good-version.3 0.8042         0.0016        -             
## good-version.4 1.0000         0.0044        1.0000        
## 
## P value adjustment method: holm
# Same pairwise comparison, grouped by language instead of version.
pairwise.wilcox.test(x = df$success, g = df$lang, p.adjust.method = "holm")
## 
##  Pairwise comparisons using Wilcoxon rank sum test 
## 
## data:  df$success and df$lang 
## 
##    de en
## en 1  - 
## fr 1  1 
## 
## P value adjustment method: holm
# Test whether success is independent of version, using the
# version-by-success contingency table.
chisq.test(x = table(df$version, df$success))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$version, df$success)
## X-squared = 13.312, df = 3, p-value = 0.004009
# Test whether success is independent of language, using the
# language-by-success contingency table.
chisq.test(x = table(df$lang, df$success))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$lang, df$success)
## X-squared = 0.095007, df = 2, p-value = 0.9536

Now let’s see how many visitors we need, both in total and on the bad version, to detect the problem, i.e. for the largest p-value across the pairs (bad version vs. good versions 1, 3 and 4) to fall below 0.05.

# Grid search over sample sizes: for every combination of bad-version visitor
# count and base visitor total, simulate one data set and record the largest
# adjusted p-value among the three "bad version vs. good version" pairs.
# `ret` ends up with one row per combination: bad_visitors, total_visitors
# and maxP.
bad_level = "bad-version.2"
good_levels = c("good-version.1", "good-version.3", "good-version.4")
bad_visitor_grid = c(5, 10, 20, 30, 40, 60, 80, 100)
total_visitor_grid = c(50, 100, 200, 300, 400, 600, 800)

# Collect the per-run rows in a preallocated list and bind once at the end
# instead of growing `ret` with rbind() on every iteration.
runs = vector("list", length(bad_visitor_grid) * length(total_visitor_grid))
run = 0L
for (bad_visitors in bad_visitor_grid) {
  for (total_visitors in total_visitor_grid) {
    df = generateVisitorData(bad_visitors, total_visitors)
    # Pin the level order. pairwise.wilcox.test() lays out its p-value matrix
    # by factor level order, and that order depends on whether `version` is a
    # factor (order of appearance) or a character vector (sorted
    # alphabetically, the default since R 4.0). Positional indices such as
    # [2:3, 2] would silently select good-vs-good pairs in the latter case.
    version_factor = factor(
      df$version,
      levels = c(good_levels[1], bad_level, good_levels[-1])
    )
    p = pairwise.wilcox.test(df$success, version_factor)
    # Select the three bad-vs-good cells by name rather than by position.
    bad_vs_good = c(
      p$p.value[bad_level, good_levels[1]],
      p$p.value[good_levels[-1], bad_level]
    )
    run = run + 1L
    runs[[run]] = data.frame(
      bad_visitors = bad_visitors,
      total_visitors = total_visitors,
      maxP = max(bad_vs_good)
    )
  }
}
ret = do.call(rbind, runs)

# One point per simulated combination, coloured by whether the largest
# bad-vs-good p-value fell below 0.05.
ggplot(
  data = ret,
  mapping = aes(x = total_visitors, y = bad_visitors, colour = maxP < 0.05)
) +
  geom_point()