Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
Chi-square goodness-of-fit: Benford’s Law
## Attach the Benford's Law analysis package, then read the
## Sino Forest data set from its remote CSV into a data frame.
library(benford.analysis)
sino.forest <- read.csv(
  file = "http://www.bradthiessen.com/html5/data/sinoforest.csv"
)
## Pull the leading digit out of each entry of the `value` column.
## Only one digit is requested, restricted to positive values.
first_digit <- extract.digits(
  sino.forest$value,
  number.of.digits = 1,
  sign = "positive",
  second.order = FALSE,
  discrete = TRUE,
  round = 3
)
## Tabulate how often each first digit occurs, with relative frequencies.
## The denominator is the sum of the per-digit counts rather than a
## hard-coded sample size, so the proportions stay correct if the
## underlying data set changes.
first_digit %>%
  group_by(data.digits) %>%
  summarize(n = n()) %>%
  mutate(rel_freq = n / sum(n))
## Source: local data frame [9 x 3]
##
## data.digits n rel_freq
## 1 1 231 0.29922280
## 2 2 124 0.16062176
## 3 3 97 0.12564767
## 4 4 70 0.09067358
## 5 5 64 0.08290155
## 6 6 54 0.06994819
## 7 7 40 0.05181347
## 8 8 54 0.06994819
## 9 9 38 0.04922280
## Run the chi-squared analysis on the first digits.
## NOTE(review): the input is the already-extracted digit column, not the
## raw values -- confirm the mantissa-based statistics are meaningful here.
## The result is stored under the same name as the function that builds it.
benford <- benford(
  first_digit$data.digits,
  number.of.digits = 1,
  sign = "both"
)
benford # auto-prints the Benford object
##
## Benford object:
##
## Data: first_digit$data.digits
## Number of observations used = 772
## Number of obs. for second order = 8
## First digits analysed = 1
##
## Mantissa:
##
## [1] 0.43 0.11 -1.40 -0.02
##
## The 5 largest deviations:
##
## digits absolute.diff
## 1 8 14.51
## 2 2 11.94
## 3 4 4.81
## 4 7 4.77
## 5 5 2.87
##
## Stats:
##
## Pearson's Chi-squared test
##
## data: first_digit$data.digits
## X-squared = 7.6517, df = 8, p-value = 0.4682
##
##
## Mantissa Arc Test
##
## data: first_digit$data.digits
## L2 = 0.0458, df = 2, p-value = 4.304e-16
##
## Mean Absolute Deviation: 0.006598135
## Distortion Factor: -8.934679
##
## Remember: Real data will never conform perfectly to Benford's Law. You should not focus on p-values!
## Plot the Benford object created above using its plot method's defaults.
plot(benford)
## Roll a six-sided die 120 times (uniform sampling with replacement)
## and tabulate how often each face came up.
## No seed is set, so the counts differ from run to run.
die <- sample(1:6, 120, replace = TRUE)
table(die)
## die
## 1 2 3 4 5 6
## 22 14 21 25 17 21