require(mosaic)
require(dplyr)
require(ggplot2)
require(RSQLite)
require(pitchRx)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
## Create new database: db = MaxScherzer.sqlite3
db <- src_sqlite("MaxScherzer.sqlite3", create = TRUE)
## Scrape data from a range of dates
## Could also use Game ID with...
### GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## This gets us 5 dataframes: atbat, action, pitch, po, runner
scrape(start = "2014-06-12", end = "2014-06-12", connect = db$con)
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_lanmlb_cinmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_sdnmlb_phimlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_atlmlb_colmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_wasmlb_sfnmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_chnmlb_pitmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_tormlb_balmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_clemlb_bosmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_milmlb_nynmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_arimlb_houmlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_detmlb_chamlb_1/inning/inning_all.xml
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_nyamlb_seamlb_1/inning/inning_all.xml
## NULL
atbats <- tbl(db, 'atbat') %.%
filter(date >= '2014_06_12' & date <= '2014_06_12') %.%
filter(pitcher_name == 'Max Scherzer')
pitches <- tbl(db, 'pitch')
MaxScherzerJune12 <- collect(inner_join(pitches, atbats, by = c('num', 'gameday_link')))
MaxS <- collect(MaxScherzerJune12)
## Select the atbat dataframe
# atbats <- tbl(db, 'atbat')
## Select the pitch table
# pitches <- tbl(db, 'pitch')
## Filter pitches thrown by Max Scherzer
# max <- filter(atbats, pitcher_name == 'Max Scherzer')
## Query the database to load this data
# dbSendQuery(db$con, 'CREATE INDEX pitcher_idx ON atbat(pitcher_name)')
## Join the atbat table with the pitches table
# MaxScherzerJune12 <- inner_join(pitches, max, by = c('num', 'gameday_link'))
## Query the database
# dbSendQuery(db$con, 'CREATE INDEX pitch_idx ON pitch(gameday_link, num)')
## Collect the joined tables into a dataframe
# MaxS <- collect(MaxScherzerJune12)
## Density - all pitches
strikeFX(MaxS, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 16 rows containing non-finite values (stat_density2d).
## Warning: Removed 12 rows containing non-finite values (stat_density2d).
## Just the called strikes
Maxstrikes <- subset(MaxS, des == "Called Strike")
strikeFX(Maxstrikes, geom="tile", layer=facet_grid(.~stand))
## Just the swinging strikes
Maxswingstrikes <- subset(MaxS, des == "Swinging Strike")
strikeFX(Maxswingstrikes, geom="tile", layer=facet_grid(.~stand))
## Just the balls
Maxballs <- subset(MaxS, des == "Ball")
strikeFX(Maxballs, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
## Warning: Removed 8 rows containing non-finite values (stat_density2d).
## The probability of a strike based on location
noswing <- subset(MaxS, des %in% c("Ball", "Called Strike"))
noswing$strike <- as.numeric(noswing$des %in% "Called Strike")
require(mgcv)
m1 <- bam(strike ~ s(px, pz, by=factor(stand)) +
factor(stand), data=noswing, family = binomial(link='logit'))
strikeFX(noswing, model=m1, layer=facet_grid(.~stand))
## Animated pitches (averaged by pitch type)
library(animation)
saveHTML(
animateFX(MaxS, avg.by = 'pitch_types', layer = list(theme_bw(), facet_grid(.~stand))),
img.name = "MaxS"
)
require(mosaic)
require(dplyr)
require(ggplot2)
require(pitchRx)
require(plyr)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
# Scrape data (can use GameID or range of dates)
GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## http://gd2.mlb.com/components/game/mlb/year_2011/month_05/day_07/gid_2011_05_07_detmlb_tormlb_1/inning/inning_all.xml
# Combine pitch and at-bat data
pitchFX <- join(GameData$pitch, GameData$atbat, by = c("num", "url"), type = "inner")
# This creates a dataframe with 69 columns
names(pitchFX)
## [1] "des" "des_es" "id"
## [4] "type" "tfs" "tfs_zulu"
## [7] "x" "y" "cc"
## [10] "mt" "url" "inning_side"
## [13] "inning" "next_" "num"
## [16] "sv_id" "start_speed" "end_speed"
## [19] "sz_top" "sz_bot" "pfx_x"
## [22] "pfx_z" "px" "pz"
## [25] "x0" "y0" "z0"
## [28] "vx0" "vy0" "vz0"
## [31] "ax" "ay" "az"
## [34] "break_y" "break_angle" "break_length"
## [37] "pitch_type" "type_confidence" "zone"
## [40] "nasty" "spin_dir" "spin_rate"
## [43] "on_1b" "on_2b" "on_3b"
## [46] "gameday_link" "count" "pitcher"
## [49] "batter" "b" "s"
## [52] "o" "start_tfs" "start_tfs_zulu"
## [55] "stand" "b_height" "p_throws"
## [58] "atbat_des" "atbat_des_es" "event"
## [61] "inning_side" "inning" "next_"
## [64] "score" "home_team_runs" "away_team_runs"
## [67] "batter_name" "pitcher_name" "gameday_link"
## [70] "date"
# Keep only the pitches thrown by Justin Verlander
pitches <- subset(pitchFX, pitcher_name == "Justin Verlander")
# Graph all the pitches thrown by JV
strikeFX(pitches, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
The above graph shows the location of all the pitches Justin Verlander threw that day (to right- and left-handed batters). Let’s look at just the called strikes:
strikes <- subset(pitches, des == "Called Strike")
strikeFX(strikes, geom="tile", layer=facet_grid(.~stand))
… and the swinging strikes:
swingstrikes <- subset(pitches, des == "Swinging Strike")
strikeFX(swingstrikes, geom="tile", layer=facet_grid(.~stand))
… and the balls:
balls <- subset(pitches, des == "Ball")
strikeFX(balls, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
Finally, here’s an animation of all the pitches from Justin Verlander during this game:
animateFX(pitches, layer=list(facet_grid(pitcher_name~stand, labeller = label_both), theme_bw(), coord_equal()))
require(animation)
ani.options(interval = 0.05)
saveHTML({animateFX(pitches, layer=list(facet_grid(pitcher_name~stand, labeller = label_both), theme_bw(), coord_equal()))}, img.name="JVpitches")
# The Sxslt package is required in order to download new game data from MLBAM
install.packages("Sxslt", repos = "http://www.omegahat.org/R", type = "source")
# Install openWAR package
require(devtools)
install_github("openWAR", "beanumber")
# Load package
require(openWAR)
## Get GameIDs for a particular date
getGameIds(date=as.Date("2014-06-12"))
## "gid_2014_06_12_detmlb_chamlb_1" is a DET vs CLE game
## Load data
data(MLBAM2013)
## Get MLBAM player ID for a particular player
playerNames <- unique(MLBAM2013$batterName)
playerNames[grep("Jackson", playerNames)]
## Austin Jackson is "Jackson, A"
jacksonId = unique(subset(MLBAM2013, batterName == "Jackson, A")$batterId)
jackson = subset(MLBAM2013, playerId.CF == jacksonId)
## Calculate the number of flyballs caught by Jackson in CF in 2013
require(mosaic)
head(sort(tally(~event, data=subset(jackson, fielderId == playerId.CF)), decreasing=TRUE))
## Other statistics, like
#### groundout-to-air out ratio of the Tigers;
#### batting average on balls in play of their opponents.
require(plyr)
ddply(jackson, ~playerId.CF, summarise,
N = length(playerId.CF), G = length(unique(gameId)),
BIP = sum(isBIP), PO = sum(fielderId == playerId.CF, na.rm=TRUE),
"GO/AO" = length(grep("Ground", event)) / length(grep("Fly", event)),
BABIP = sum(isHit) / sum(isAB)
)
## Visualize Jackson's catches on the field
plot(subset(jackson, fielderId == playerId.CF))
## Hexbin plot
require(hexbin)
my.colors <- function (n) {
rev(heat.colors(n))
}
hexbinplot(our.y ~ our.x, data=subset(jackson, fielderId == playerId.CF), xbins = 10
, panel = function(x,y, ...) {
panel.baseball()
panel.hexbinplot(x,y, ...)
}
, xlim = c(-350, 350), ylim = c(-20, 525)
, xlab = "Horizontal Distance from Home Plate (ft.)"
, ylab = "Vertical Distance from Home Plate (ft.)"
, colramp = my.colors, colorcut = seq(0, 1, length = 10)
)
## Comparing Jackson to all other CF players
hexbinplot(our.y ~ our.x, data=subset(MLBAM2013, fielderId == playerId.CF), xbins = 50
, panel = function(x,y, ...) {
panel.baseball()
panel.hexbinplot(x,y, ...)
}
, xlim = c(-350, 350), ylim = c(-20, 525)
, xlab = "Horizontal Distance from Home Plate (ft.)"
, ylab = "Vertical Distance from Home Plate (ft.)"
, colramp = my.colors, colorcut = seq(0, 1, length = 10)
)
## Compare Jackson to a selected group of players
key = unique(subset(MLBAM2013, batterName %in% c("Trout", "Upton, B", "Jackson, A", "Ellsbury"), select=c("batterId", "batterName")))
comp = subset(MLBAM2013, playerId.CF %in% key$batterId & fielderId == playerId.CF)
hexbinplot(our.y ~ our.x | as.factor(playerId.CF), data=comp, xbins = 10
, panel = function(x,y, ...) {
panel.baseball()
panel.hexbinplot(x,y, ...)
}
, xlim = c(-350, 350), ylim = c(-20, 525)
, xlab = "Horizontal Distance from Home Plate (ft.)"
, ylab = "Vertical Distance from Home Plate (ft.)"
, colramp = my.colors, colorcut = seq(0, 1, length = 10)
, strip = strip.custom(factor.levels = as.character(key$batterName))
)
************************************** MARCEL: http://www.tangotiger.net/marcel/ **************************************
require(Lahman)
## We'll calculate MARCEL for 2004
pred.year = 2004
## Collect and aggregate data for 2001-03 (adding plate appearances)
library(dplyr)
data(Batting)
A <- Batting %.%
group_by(playerID, yearID) %.%
filter(yearID >= pred.year - 3 & yearID < pred.year) %.%
select(playerID, yearID, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB, SO, IBB, HBP, SH, SF, GIDP) %.%
mutate(PA = AB + BB + HBP + SF + SH) %.%
summarize(PA = sum(PA), AB = sum(AB), R = sum(R),
H = sum(H), X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR),
RBI = sum(RBI), SB = sum(SB), CS = sum(CS), BB = sum(BB),
SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH),
SF = sum(SF), GIDP = sum(GIDP))
## We could also use plyr for this
# B = subset(Batting, yearID >= pred.year - 3 & yearID < pred.year)
# B = transform(B, PA = AB + BB + HBP + SF + SH)
# require(plyr)
# stats = c("PA", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB",
# "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP")
# B = ddply(B[, c("playerID", "yearID", stats)], ~playerID + yearID, summarise
# , PA = sum(PA), AB = sum(AB), R = sum(R)
# , H = sum(H), X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR)
# , RBI = sum(RBI), SB = sum(SB), CS = sum(CS), BB = sum(BB)
# , SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH)
# , SF = sum(SF), GIDP = sum(GIDP))
## Carlos Beltran home run projection
## Get HR and PA totals
x = subset(A, playerID == "beltrca01")$HR
x
## Plate appearances
n = subset(A, playerID == "beltrca01")$PA
n
## Using dplyr results in data.frames
# x <- A %.%
# filter(playerID == "beltrca01"); q$HR
# n <- A %.%
# filter(playerID == "beltrca01"); q$PA
## MARCEL weighs statistics over time
## The season immediately preceding the current one is given
## a weight of 5; the season before that=4; the season two before=3.
## Create a vector, t, with these weights
A$t = with(A, ifelse(pred.year - yearID == 1, 5, ifelse(pred.year - yearID == 2, 4, 3)))
## Beltran’s time-weighted home run totals is an inner product
t = subset(A, playerID == "beltrca01")$t
t
t %*% x
## remove pitchers from before we compute the league averages
P = subset(Pitching, yearID >= pred.year - 3 & yearID < pred.year)
A.pos = subset(merge(x=A, y=P[, c("playerID", "yearID", "BFP")], by=c("playerID", "yearID"), all.x=TRUE), PA > BFP | is.na(BFP))
## League averages are computed by simply summing over all the rows
p0 <- A.pos %.%
group_by(yearID) %.%
summarize(lgPA = sum(PA), lgAB = sum(AB)/sum(PA), lgR = sum(R)/sum(PA),
lgH = sum(H)/sum(PA), lgX2B = sum(X2B)/sum(PA), lgX3B = sum(X3B)/sum(PA),
lgHR = sum(HR)/sum(PA), lgRBI = sum(RBI)/sum(PA), lgSB = sum(SB)/sum(PA),
lgCS = sum(CS)/sum(PA), lgBB = sum(BB)/sum(PA), lgSO = sum(SO)/sum(PA),
lgIBB = sum(IBB)/sum(PA), lgHBP = sum(HBP)/sum(PA), lgSH = sum(SH)/sum(PA),
lgSF = sum(SF)/sum(PA), lgGIDP = sum(GIDP)/sum(PA))
p0[, c("yearID", "lgHR")]
## Using plyr
# p0 = ddply(B.pos, ~yearID, summarise, lgPA = sum(PA), lgAB = sum(AB)/sum(PA), lgR = sum(R)/sum(PA)
# , lgH = sum(H)/sum(PA), lgX2B = sum(X2B)/sum(PA), lgX3B = sum(X3B)/sum(PA), lgHR = sum(HR)/sum(PA)
# , lgRBI = sum(RBI)/sum(PA), lgSB = sum(SB)/sum(PA), lgCS = sum(CS)/sum(PA), lgBB = sum(BB)/sum(PA)
# , lgSO = sum(SO)/sum(PA), lgIBB = sum(IBB)/sum(PA), lgHBP = sum(HBP)/sum(PA), lgSH = sum(SH)/sum(PA)
# , lgSF = sum(SF)/sum(PA), lgGIDP = sum(GIDP)/sum(PA))
# p0[, c("yearID", "lgHR")]
## Calculating MARCEL
# Merge league averages with player statistics
M = merge(x=A, y=p0, by="yearID")
# List of statistics to get projected
stats.lg = paste("lg", stats, sep="")
# X is a matrix of statistics
X = M[, stats]
# P0 is a matrix of league average statistics
P0 = M[, stats.lg]
# Carry out the calculations
t.X = M$t * X
t.n = M$t * M$PA
t.n0 = M$t * 100
t.P0 = M$t * M$PA * P0
# add Tango’s estimate for each player’s plate appearance projection
mPA = with(M, ifelse(pred.year - yearID == 1, 0.5 * PA, ifelse(pred.year - yearID == 2, 0.1 * PA, 200)))
# add these to our data frame and aggregate it by player
Q = cbind(M[, c("playerID", "yearID")], t.n, t.X, t.n0, t.P0, mPA)
## Using plyr
res = ddply(Q, ~playerID, summarise, numSeasons = length(t.n), reliability = sum(t.n)/(sum(t.n) + sum(t.n0))
, tn = sum(t.n)
, PA = sum(PA), lgPA = sum(lgPA), mPA = sum(mPA)
, AB = sum(AB), lgAB = sum(lgAB), R = sum(R), lgR = sum(lgR)
, H = sum(H), lgH = sum(lgH), X2B = sum(X2B), lgX2B = sum(lgX2B)
, X3B = sum(X3B), lgX3B = sum(lgX3B), HR = sum(HR), lgHR = sum(lgHR)
, RBI = sum(RBI), lgRBI = sum(lgRBI), SB = sum(SB), lgSB = sum(lgSB)
, CS = sum(CS), lgCS = sum(lgCS), BB = sum(BB), lgBB = sum(lgBB)
, SO = sum(SO), lgSO = sum(lgSO), IBB = sum(IBB), lgIBB = sum(lgIBB)
, HBP = sum(HBP), lgHBP = sum(lgHBP), SH = sum(SH), lgSH = sum(lgSH)
, SF = sum(SF), lgSF = sum(lgSF), GIDP = sum(GIDP), lgGIDP = sum(lgGIDP))
## Using dplyr
# res <- Q %.%
# group_by(playerID) %.%
# mutate(numSeasons = length(t.n), reliability = sum(t.n)/(sum(t.n) + sum(t.n0))) %.%
# summarize(tn = sum(t.n)
# , PA = sum(PA), lgPA = sum(lgPA), mPA = sum(mPA)
# , AB = sum(AB), lgAB = sum(lgAB), R = sum(R), lgR = sum(lgR)
# , H = sum(H), lgH = sum(lgH), X2B = sum(X2B), lgX2B = sum(lgX2B)
# , X3B = sum(X3B), lgX3B = sum(lgX3B), HR = sum(HR), lgHR = sum(lgHR)
# , RBI = sum(RBI), lgRBI = sum(lgRBI), SB = sum(SB), lgSB = sum(lgSB)
# , CS = sum(CS), lgCS = sum(lgCS), BB = sum(BB), lgBB = sum(lgBB)
# , SO = sum(SO), lgSO = sum(lgSO), IBB = sum(IBB), lgIBB = sum(lgIBB)
# , HBP = sum(HBP), lgHBP = sum(lgHBP), SH = sum(SH), lgSH = sum(lgSH)
# , SF = sum(SF), lgSF = sum(lgSF), GIDP = sum(GIDP), lgGIDP = sum(lgGIDP))
## The MARCEL projectiions are then the weighted averages
## of the player’s stats and the league averages,
## with the weights given by the reliability
stats.proj = setdiff(stats, "PA")
stats.m = paste("m", stats.proj, sep="")
stats.lg.proj = paste("lg", stats.proj, sep="")
res[, stats.m] = with(res, (reliability * res[, stats.proj]) / tn + (1 - reliability) * res[, stats.lg.proj] / tn)
subset(res, playerID == "beltrca01", select=c("reliability", "tn", "mPA", "HR", "lgHR", "mHR"))
res = merge(x=res, y=Master[,c("playerID", "birthYear")], by="playerID")
res = transform(res, age = pred.year - birthYear)
res$age.adj = with(res, ifelse(age > 29, 0.003 * (age - 29), 0.006 * (age - 29)))
res[, stats.m] = res[, stats.m] * (1 + res$age.adj)
subset(res, playerID == "beltrca01", select=c("reliability", "age.adj", "tn", "mPA", "HR", "lgHR", "mHR"))
************************************** Top batting averages over time **************************************
### Source: http://lahman.r-forge.r-project.org/doc/Batting.html
## Create a plot of top batting averages over time
# Loads a baseball dataset
library(Lahman)
# Loads batting data
data(Batting)
# Loads a package that helps manipulate/reshape data
require('plyr')
# Look at the first several rows of data
head(Batting)
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR RBI SB
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0 0 0
## CS BB SO IBB HBP SH SF GIDP G_old
## 1 0 0 0 0 0 0 0 0 11
## 2 0 0 0 0 0 1 0 0 45
## 3 0 0 0 0 0 0 0 0 2
## 4 0 0 1 0 0 0 0 0 5
## 5 0 0 0 0 0 0 0 0 NA
## 6 0 0 0 0 0 0 0 0 NA
# calculate batting average and other stats
batting <- battingStats()
# add salary to Batting data; need to match by player, year and team
batting <- merge(batting,
Salaries[,c("playerID", "yearID", "teamID", "salary")],
by=c("playerID", "yearID", "teamID"), all.x=TRUE)
# Add name, age and bat hand information:
masterInfo <- Master[, c('playerID', 'birthYear', 'birthMonth',
'nameLast', 'nameFirst', 'bats')]
batting <- merge(batting, masterInfo, all.x = TRUE)
batting$age <- with(batting, yearID - birthYear -
ifelse(birthMonth < 10, 0, 1))
batting <- arrange(batting, playerID, yearID, stint)
## Generate a plot of batting average over time
# Restrict the pool of eligible players to the years after 1899 and
# players with a minimum of 450 plate appearances (this covers the
# strike year of 1994 when Tony Gwynn hit .394 before play was suspended
# for the season - in a normal year, the minimum number of plate appearances is 502)
eligibleHitters <- subset(batting, yearID >= 1900 & PA > 450)
# Find the hitters with the highest BA in MLB each year (there are a
# few ties). Include all players with BA > .400
topHitters <- ddply(eligibleHitters, .(yearID), subset, (BA == max(BA))|BA > .400)
# Create a factor variable to distinguish the .400 hitters
topHitters$ba400 <- with(topHitters, BA >= 0.400)
# Sub-data frame for the .400 hitters plus the outliers after 1950
# (averages above .380) - used to produce labels in the plot below
bignames <- rbind(subset(topHitters, ba400),
subset(topHitters, yearID > 1950 & BA > 0.380))
# Cut to the relevant set of variables
bignames <- subset(bignames, select = c('playerID', 'yearID', 'nameLast',
'nameFirst', 'BA'))
# Ditto for the original data frame
topHitters <- subset(topHitters, select = c('playerID', 'yearID', 'BA', 'ba400'))
# Positional offsets to spread out certain labels
# NL TC JJ TC GS TC RH GS HH RH RH BT TW TW RC GB TG
bignames$xoffset <- c(0, 0, 0, 0, 0, 0, 0, 0, -8, 0, 3, 3, 0, 0, -2, 0, 0)
bignames$yoffset <- c(0, 0, -0.003, 0, 0, 0, 0, 0, -0.004, 0, 0, 0, 0, 0, -0.003, 0, 0) + 0.002
# Load package for creating visualizations
require('ggplot2')
# Create the visualization
ggplot(topHitters, aes(x = yearID, y = BA)) +
geom_point(aes(colour = ba400), size = 2.5) +
geom_hline(yintercept = 0.400, size = 1) +
geom_text(data = bignames, aes(x = yearID + xoffset, y = BA + yoffset,
label = nameLast), size = 3) +
scale_colour_manual(values = c('FALSE' = 'black', 'TRUE' = 'red')) +
ylim(0.330, 0.430) +
xlab('Year') +
scale_y_continuous('Batting average',
breaks = seq(0.34, 0.42, by = 0.02),
labels = c('.340', '.360', '.380', '.400', '.420')) +
geom_smooth() +
theme(legend.position = 'none')
************************************** Homerun Trend **************************************
### Source: http://lahman.r-forge.r-project.org/doc/Batting.html
# Total home runs by year
totalHR <- ddply(Batting, .(yearID), summarise,
HomeRuns = sum(as.numeric(HR), na.rm=TRUE),
Games = sum(as.numeric(G_batting), na.rm=TRUE),
HRperGame = HomeRuns/Games
)
totalHR <- totalHR[ which(totalHR$Games>0), ]
# Quick look at the data
head(totalHR)
## yearID HomeRuns Games HRperGame
## 1 1871 47 2296 0.020470
## 2 1872 35 3307 0.010584
## 3 1873 46 3603 0.012767
## 4 1874 40 4199 0.009526
## 5 1875 40 6249 0.006401
## 6 1876 40 4696 0.008518
# Plot trend (total homeruns / total games played)
# Add lowess smoothed line to see trend
ggplot(totalHR, aes(x=yearID, y=HRperGame)) +
geom_point(shape=1, alpha=0.8) + # Use hollow circles
geom_smooth(alpha=0.3) # Add a loess smoothed fit curve with confidence region
************************************** dplyr demonstrations **************************************
### Lists datasets available in the Lahman package
data(package="Lahman")
# Loads Batting data
data(Batting)
# Gets dimensions of Batting data
dim(Batting)
## [1] 96600 24
This Batting dataset has 96,600 rows and 24 columns. Let’s get an idea of what those rows and columns represent:
head(Batting, 8)
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0
## 7 aardsda01 2012 1 NYA AL 1 NA NA NA NA NA NA NA
## 8 aaronha01 1954 1 ML1 NL 122 122 468 58 131 27 6 13
## RBI SB CS BB SO IBB HBP SH SF GIDP G_old
## 1 0 0 0 0 0 0 0 0 0 0 11
## 2 0 0 0 0 0 0 0 1 0 0 45
## 3 0 0 0 0 0 0 0 0 0 0 2
## 4 0 0 0 0 1 0 0 0 0 0 5
## 5 0 0 0 0 0 0 0 0 0 0 NA
## 6 0 0 0 0 0 0 0 0 0 0 NA
## 7 NA NA NA NA NA NA NA NA NA NA NA
## 8 69 2 2 28 39 NA 3 6 4 13 122
Each row represents the batting statistics for a baseball player in a single year playing for a single team. So, for example, if a player played for 2 teams in a single season, that player would have two rows (one for each “stint”).
From the header displayed above, we can see the player “aardsda01” has data from the 2004-2012 seasons (where he played for San Francisco, both Chicago teams, Boston, Seattle, and the Yankees).
Suppose I want to know which 5 players in MLB history have batted in the most games. To do this, I need to add up the games played (G) for all the rows for each player.
The dplyr package allows us to do these kind of data manipulations easily. We want to:
Group the rows of the dataset by playerID
Summarize those groups by adding up the games played
Arrange the data by listing the 5 players with the most games played
To do this step-by-step, we would use:
## Group rows of data by playerID
players <- group_by(Batting, playerID)
## Summarize the groups by taking the sum of games played
games <- summarize(players, total = sum(G))
## Arrange the data by games played (in descending order) and list the top 5
head(arrange(games, desc(total)), 5)
## total
## 1 4988101
Those commands took a small fraction of a second to complete. From the output, we can see the top five players with most games played are: Pete Rose, Carl Yastrzemski, Hank Aaron, Ricky Henderson, and Ty Cobb.
We could have also done this manipulation by chaining operations together with the %.% operator:
Batting %.%
group_by(playerID) %.%
summarize(total = sum(G)) %.%
arrange(desc(total)) %.%
head(5)
## total
## 1 4988101
You can think of %.% as an arrow pointing to the right –>. It tells the computer to move on to the next command in the pipeline.
Let’s see some other manipulation commands we may want to use:
This Batting dataset has 24 variables. Suppose we’re only interested in 14 of those columns: playerID, yearID, teamID, at-bats, hits, doubles, triples, homeruns, stolen bases, number of times caught stealing, strikeouts, walks, hit-by-pitch numbers, and sacrifice flies.
We can select a subset of columns from our dataset with the select command:
Batting %.%
select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
head(10)
## playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF
## 1 aardsda01 2004 SFN 0 0 0 0 0 0 0 0 0 0 0
## 2 aardsda01 2006 CHN 2 0 0 0 0 0 0 0 0 0 0
## 3 aardsda01 2007 CHA 0 0 0 0 0 0 0 0 0 0 0
## 4 aardsda01 2008 BOS 1 0 0 0 0 0 0 1 0 0 0
## 5 aardsda01 2009 SEA 0 0 0 0 0 0 0 0 0 0 0
## 6 aardsda01 2010 SEA 0 0 0 0 0 0 0 0 0 0 0
## 7 aardsda01 2012 NYA NA NA NA NA NA NA NA NA NA NA NA
## 8 aaronha01 1954 ML1 468 131 27 6 13 2 2 39 28 3 4
## 9 aaronha01 1955 ML1 602 189 37 9 27 3 1 61 49 3 4
## 10 aaronha01 1956 ML1 609 200 34 14 26 2 4 54 37 2 7
As you can see, our dataset now has these 14 columns. Suppose we wanted to add some new columns:
Batting average = hits / at-bats
On-base percentage = (hits + walks + hit-by-pitch) / (at-bats + walks + hbp + sacrifice flies)
Slugging percentage = (singles + 2doubles + 3triples + 4HRs) / (at-bats)
It’s easy to add columns using the mutate command in dplyr:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
mutate(Avg = H/AB,
OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
head(12)
## playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF Avg
## 1 aardsda01 2004 SFN 0 0 0 0 0 0 0 0 0 0 0 NaN
## 2 aardsda01 2006 CHN 2 0 0 0 0 0 0 0 0 0 0 0.0000
## 3 aardsda01 2007 CHA 0 0 0 0 0 0 0 0 0 0 0 NaN
## 4 aardsda01 2008 BOS 1 0 0 0 0 0 0 1 0 0 0 0.0000
## 5 aardsda01 2009 SEA 0 0 0 0 0 0 0 0 0 0 0 NaN
## 6 aardsda01 2010 SEA 0 0 0 0 0 0 0 0 0 0 0 NaN
## 7 aardsda01 2012 NYA NA NA NA NA NA NA NA NA NA NA NA NA
## 8 aaronha01 1954 ML1 468 131 27 6 13 2 2 39 28 3 4 0.2799
## 9 aaronha01 1955 ML1 602 189 37 9 27 3 1 61 49 3 4 0.3140
## 10 aaronha01 1956 ML1 609 200 34 14 26 2 4 54 37 2 7 0.3284
## 11 aaronha01 1957 ML1 615 198 27 6 44 1 1 58 57 0 3 0.3220
## 12 aaronha01 1958 ML1 601 196 34 4 30 4 1 49 59 1 3 0.3261
## OBP SLG
## 1 NaN NaN
## 2 0.0000 0.0000
## 3 NaN NaN
## 4 0.0000 0.0000
## 5 NaN NaN
## 6 NaN NaN
## 7 NA NA
## 8 0.3221 0.4466
## 9 0.3663 0.5399
## 10 0.3649 0.5583
## 11 0.3778 0.6000
## 12 0.3855 0.5458
As you can see, the command worked for rows 8-12. We can see, for example, that Hank Aaron (aaronha01) batted .279 in 1954. We can also see the command gave us “NaN” and “NA” in several of those rows.
The NA represents missing data, while NaN means “not a number.” We got NaN results when we tried to divide by zero in our calculations. For example, the first row lists a player with no at-bats. When we tried to calculate a batting average for that row, the calculation was 0/0 = NaN. To get around this problem, we can filter the dataset to include only players who have at least one at-bat in a season:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
filter(AB>0) %.%
mutate(Avg = H/AB,
OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
head(5)
## playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF Avg
## 1 aardsda01 2006 CHN 2 0 0 0 0 0 0 0 0 0 0 0.0000
## 2 aardsda01 2008 BOS 1 0 0 0 0 0 0 1 0 0 0 0.0000
## 3 aaronha01 1954 ML1 468 131 27 6 13 2 2 39 28 3 4 0.2799
## 4 aaronha01 1955 ML1 602 189 37 9 27 3 1 61 49 3 4 0.3140
## 5 aaronha01 1956 ML1 609 200 34 14 26 2 4 54 37 2 7 0.3284
## OBP SLG
## 1 0.0000 0.0000
## 2 0.0000 0.0000
## 3 0.3221 0.4466
## 4 0.3663 0.5399
## 5 0.3649 0.5583
Let’s find the top 10 season slugging percentages in MLB history:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
filter(AB>0) %.%
mutate(Avg = H/AB,
OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
arrange(desc(SLG)) %.%
head(11)
## playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF Avg OBP SLG
## 1 chacigu01 2010 HOU 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 2 hernafe02 2008 SEA 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 3 lefebbi01 1938 BOS 1 1 0 0 1 0 0 0 0 0 NA 1 NA 4
## 4 motagu01 1999 MON 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 5 narumbu01 1963 BAL 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 6 perrypa02 1988 CHN 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 7 quirkja01 1984 CLE 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 8 rogered01 2005 BAL 1 1 0 0 1 0 2 0 0 0 0 1 1 4
## 9 sleatlo01 1958 DET 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 10 yanes01 2000 TBA 1 1 0 0 1 0 0 0 0 0 0 1 1 4
## 11 altroni01 1924 WS1 1 1 0 1 0 0 0 0 0 0 NA 1 NA 3
As you can see, there were 10 players who had a homerun in their single at-bat during the season (which yields a slugging percentage of 4.00). Let’s filter these results to only include players with at least 100 at-bats in a season:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
filter(AB>100) %.%
mutate(Avg = H/AB,
OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
arrange(desc(SLG)) %.%
head(10)
## playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF Avg
## 1 bondsba01 2001 SFN 476 156 32 2 73 13 3 93 177 9 2 0.3277
## 2 ruthba01 1920 NYA 457 172 36 9 54 14 14 80 150 3 NA 0.3764
## 3 ruthba01 1921 NYA 540 204 44 16 59 17 13 81 145 4 NA 0.3778
## 4 bondsba01 2004 SFN 373 135 27 3 45 6 1 41 232 9 3 0.3619
## 5 bondsba01 2002 SFN 403 149 31 2 46 9 2 47 198 9 2 0.3697
## 6 ruthba01 1927 NYA 540 192 29 8 60 7 6 89 137 0 NA 0.3556
## 7 gehrilo01 1927 NYA 584 218 52 18 47 10 8 84 109 3 NA 0.3733
## 8 ruthba01 1923 NYA 522 205 45 13 41 17 21 93 170 4 NA 0.3927
## 9 hornsro01 1925 SLN 504 203 41 10 39 5 3 39 83 2 NA 0.4028
## 10 mcgwima01 1998 SLN 509 152 21 0 70 1 0 155 162 6 4 0.2986
## OBP SLG
## 1 0.5151 0.8634
## 2 NA 0.8490
## 3 NA 0.8463
## 4 0.6094 0.8123
## 5 0.5817 0.7990
## 6 NA 0.7722
## 7 NA 0.7654
## 8 NA 0.7644
## 9 NA 0.7560
## 10 0.4699 0.7525
From this, we see Barry Bonds had the highest slugging percentage in 2001 (when he hit 73 home runs). Let’s use a similar set of commands to see which players had the highest number of strikeouts:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, SO) %.%
arrange(desc(SO)) %.%
head(5)
## playerID yearID teamID SO
## 1 reynoma01 2009 ARI 223
## 2 dunnad01 2012 CHA 222
## 3 reynoma01 2010 ARI 211
## 4 stubbdr01 2011 CIN 205
## 5 reynoma01 2008 ARI 204
Mark Reynolds, in 2009, struck-out 223 times. Adam Dunn was a close second, with 222 strikeouts in 2012 for the White Sox.
Who had the most career strikeouts? Let’s see:
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, SO) %.%
group_by(playerID) %.%
summarize(totalSO = sum(SO)) %.%
arrange(desc(totalSO)) %.%
head(5)
## totalSO
## 1 NA
Reggie Jackson has struck-out more times (2597 times) than any other player in MLB history. Does he have the highest strike-out rate (strikeouts per at-bat)? Let’s see (for players with at least 2000 at-bats in their careers):
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, SO) %.%
group_by(playerID) %.%
summarize(totalSO = sum(SO),
totalAB = sum(AB)) %.%
filter(totalAB>2000) %.%
mutate(Krate = totalSO/totalAB) %.%
arrange(desc(Krate)) %.%
head(5)
## [1] totalSO totalAB Krate
## <0 rows> (or 0-length row.names)
A Google search tells me the leader, striking out nearly 39% of the time, is Jack Cust (who amassed most of his strikeouts in Oakland from 2007-2009).
I wonder who had the highest strikeout rate while playing for the Detroit Tigers…
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, SO) %.%
group_by(playerID) %.%
filter(teamID=="DET") %.%
summarize(totalSO = sum(SO),
totalAB = sum(AB)) %.%
filter(totalAB>2000) %.%
mutate(Krate = totalSO/totalAB) %.%
arrange(desc(Krate)) %.%
head(5)
## [1] totalSO totalAB Krate
## <0 rows> (or 0-length row.names)
Brandon Inge. What about the lowest strike-out rate? To do this, I just sort the data in ascending order (deleting the desc command):
## Add new columns (BA, OBP, SLG)
Batting %.%
select(playerID, yearID, teamID, AB, SO) %.%
group_by(playerID) %.%
filter(teamID=="DET") %.%
summarize(totalSO = sum(SO),
totalAB = sum(AB)) %.%
filter(totalAB>2000) %.%
mutate(Krate = totalSO/totalAB) %.%
arrange(Krate) %.%
head(5)
## [1] totalSO totalAB Krate
## <0 rows> (or 0-length row.names)
Google tells me it’s Doc Cramer, who played for Detroit from 1942-1948.
playerInfo("tramm")
## playerID nameFirst nameLast
## 14308 trammal01 Alan Trammell
## 14309 trammbu01 Bubba Trammell
From this, we see there were two Trammells in MLB: Alan and Bubba. I want playerID trammal01. Let’s take a look at his batting statistics each season. Since I already know his playerID and teamID, I’ll eliminate those columns (using a minus - in the select command):
Batting %.%
filter(playerID=="trammal01") %.%
select(-playerID, -teamID, -stint, -lgID, -G_batting, -G_old)
## yearID G AB R H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP
## 1 1977 19 43 6 8 0 0 0 0 0 0 4 12 0 0 1 0 1
## 2 1978 139 448 49 120 14 6 2 34 3 1 45 56 0 2 6 3 12
## 3 1979 142 460 68 127 11 4 6 50 17 14 43 55 0 0 12 5 6
## 4 1980 146 560 107 168 21 5 9 65 12 12 69 63 2 3 13 7 10
## 5 1981 105 392 52 101 15 3 2 31 10 3 49 31 2 3 16 3 10
## 6 1982 157 489 66 126 34 3 9 57 19 8 52 47 0 0 9 6 5
## 7 1983 142 505 83 161 31 2 14 66 30 10 57 64 2 0 15 4 7
## 8 1984 139 555 85 174 34 5 14 69 19 13 60 63 2 3 6 2 8
## 9 1985 149 605 79 156 21 7 13 57 14 5 50 71 4 2 11 9 6
## 10 1986 151 574 107 159 33 7 21 75 25 12 59 57 4 5 11 4 7
## 11 1987 151 597 109 205 34 3 28 105 21 2 60 47 8 3 2 6 11
## 12 1988 128 466 73 145 24 1 15 69 7 4 46 46 8 4 0 7 14
## 13 1989 121 449 54 109 20 3 5 43 10 2 45 45 1 4 3 5 9
## 14 1990 146 559 71 170 37 1 14 89 12 10 68 55 7 1 3 6 11
## 15 1991 101 375 57 93 20 0 9 55 11 2 37 39 1 3 5 1 7
## 16 1992 29 102 11 28 7 1 1 11 2 2 15 4 0 1 1 1 6
## 17 1993 112 401 72 132 25 3 12 60 12 8 38 38 2 2 4 2 7
## 18 1994 76 292 38 78 17 1 8 28 3 0 16 35 1 1 2 0 8
## 19 1995 74 223 28 60 12 0 2 23 3 1 27 19 4 0 3 2 8
## 20 1996 66 193 16 45 2 0 1 16 6 0 10 27 0 0 1 3 3
From this, we can see he played 20 seasons from 1977-1996. We can list his 5 best seasons in terms of on-base percentage:
## Add new columns (BA, OBP, SLG)
Batting %.%
filter(playerID=="trammal01") %.%
select(-playerID, -teamID, -stint, -lgID, -G_batting, -G_old) %.%
mutate(Avg = H/AB,
OBP = (H + BB + HBP)/(AB+BB+HBP+SF)) %.%
arrange(desc(OBP)) %.%
head(5)
## yearID G AB R H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP
## 1 1987 151 597 109 205 34 3 28 105 21 2 60 47 8 3 2 6 11
## 2 1993 112 401 72 132 25 3 12 60 12 8 38 38 2 2 4 2 7
## 3 1983 142 505 83 161 31 2 14 66 30 10 57 64 2 0 15 4 7
## 4 1984 139 555 85 174 34 5 14 69 19 13 60 63 2 3 6 2 8
## 5 1990 146 559 71 170 37 1 14 89 12 10 68 55 7 1 3 6 11
## Avg OBP
## 1 0.3434 0.4024
## 2 0.3292 0.3883
## 3 0.3188 0.3852
## 4 0.3135 0.3823
## 5 0.3041 0.3770
We can also get his career totals:
Batting %.%
filter(playerID=="trammal01") %.%
group_by(playerID) %.%
dplyr::summarize(seasons = n(),
games = sum(G),
AB = sum(AB),
hits = sum(H),
dbl = sum(X2B),
trpl = sum(X3B),
HR = sum(HR),
Avg = (sum(H) / sum(AB)),
OBP = ((sum(H)+sum(BB)+sum(HBP))/(sum(AB)+sum(BB)+sum(HBP)+sum(SF))),
SB = sum(SB),
SBpct = (sum(SB)/(sum(CS)+sum(SB))))
## Source: local data frame [1 x 12]
##
## playerID seasons games AB hits dbl trpl HR Avg OBP SB SBpct
## 1 trammal01 20 2293 8288 2365 412 55 185 0.2854 0.3515 236 0.6841
Over his 20 seasons, Alan Trammell batted .285, hit 185 HR, and successfully stole bases 68.4% of the time.
### Minimum 400 games over 5 seasons with at least one SO and HR
### na.rm=TRUE is a command to remove NA values from each variable
hrSO <- Batting %.%
group_by(playerID) %.%
dplyr::summarize(seasons = n(),
games = sum(G, na.rm=TRUE),
atbats = sum(AB, na.rm=TRUE),
HR = sum(HR, na.rm=TRUE),
Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
SO = sum(SO, na.rm=TRUE)) %.%
filter(seasons>5 & games>400 & atbats>1000 & SO>0 & HR>0)
### Create plot
ggplot(hrSO, aes(HR, SO)) +
geom_point(alpha = 1/2) +
geom_smooth() +
scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
This plot shows the general positive relationship between strikeouts and homeruns. This could be simply due to some players having more at-bats than others, so let’s plot the relationship between “home runs per at-bat” and “strikeouts per at-bat”
### Minimum 400 games over 5 seasons with at least one SO and HR
### na.rm=TRUE is a command to remove NA values from each variable
hrSO2 <- Batting %.%
group_by(playerID) %.%
dplyr::summarize(seasons = n(),
games = sum(G, na.rm=TRUE),
atbats = sum(AB, na.rm=TRUE),
HR = sum(HR, na.rm=TRUE),
Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
SO = sum(SO, na.rm=TRUE),
HRperAB = (HR/atbats),
SOperAB = (SO/atbats)) %.%
filter(seasons>5 & games>400 & atbats>1000 & SO>0 & HR>0)
### Create plot
ggplot(hrSO2, aes(HRperAB, SOperAB)) +
geom_point(alpha = 1/2) +
geom_smooth() +
scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
I wonder if this same relationship holds for pitchers. Do pitchers with more career strikeouts give up more career homeruns? Let’s see (controlling for innings pitched)…
### Minimum 3000 innings pitched
### na.rm=TRUE is a command to remove NA values from each variable
PitcherHRso <- Pitching %.%
group_by(playerID) %.%
dplyr::summarize(seasons = n(),
IPouts = sum(IPouts, na.rm=TRUE),
HR = sum(HR, na.rm=TRUE),
SO = sum(SO, na.rm=TRUE),
HRperIP = (HR/IPouts),
SOperIP = (SO/IPouts)) %.%
filter(IPouts > 3000)
### Create plot
ggplot(PitcherHRso, aes(HRperIP, SOperIP)) +
geom_point(aes(size=IPouts), alpha = .4) +
geom_smooth() +
scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
The size of the dots indicate the number of innings pitched for each pitcher. This relationship is a bit more complicated than it was for batters.
See if you can figure out what this plot shows:
TeamSummary <- Batting %.%
group_by(teamID) %.%
dplyr::summarize(games = sum(G, na.rm=TRUE),
atbats = sum(AB, na.rm=TRUE),
HR = sum(HR, na.rm=TRUE),
Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
SO = sum(SO, na.rm=TRUE),
HRperGame = (HR/games),
SOperGame = (SO/games)) %.%
filter(HRperGame>0 & SOperGame>0)
ggplot(TeamSummary, aes(HRperGame, SOperGame)) +
geom_point(aes(size=Avg), alpha = 1/2) +
geom_smooth() +
scale_size_area()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.