mapping oil production by country


#==============
# LOAD PACKAGES
#==============

library(tidyverse)
library(sf)
library(rvest)
library(stringr)
library(scales)
#library(viridis)



#============
# SCRAPE DATA
#============

df.oil <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_oil_production") %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table()



#====================
# CHANGE COLUMN NAMES 
#====================

colnames(df.oil) <- c('rank', 'country', 'oil_bbl_per_day')



#=============================
# WRANGLE VARIABLES INTO SHAPE
#=============================


#----------------------------------
# COERCE 'rank' VARIABLE TO INTEGER
#----------------------------------

df.oil <- df.oil %>% mutate(rank = as.integer(rank)) 

df.oil %>% glimpse()



#---------------------------------------------------
# WRANGLE FROM CHARACTER TO NUMERIC: oil_bbl_per_day
#---------------------------------------------------

df.oil <- df.oil %>% mutate(oil_bbl_per_day = oil_bbl_per_day %>% str_replace_all(',','') %>% as.integer())


# inspect
df.oil %>% glimpse()


#===========================
#CREATE VARIABLE: 'opec_ind'
#===========================

df.oil <- df.oil %>% mutate(opec_ind = if_else(str_detect(country, 'OPEC'), 1, 0))


#=========================================================
# CLEAN UP 'country'
# - some country names are tagged as being OPEC countries
#   and this information is in the country name
# - we will strip this information out
#=========================================================

df.oil <- df.oil %>% mutate(country = country %>% str_replace(' \\(OPEC\\)', '') %>% str_replace('\\s{2,}',' '))


# inspect
df.oil %>% glimpse()


#------------------------------------------
# EXAMINE OPEC COUNTRIES
# - here, we'll just visually inspect
#   to make sure that the names are correct
#------------------------------------------

df.oil %>% 
  filter(opec_ind == 1) %>%
  select(country)



#==================
# REORDER VARIABLES
#==================

df.oil <- df.oil %>% select(rank, country, opec_ind, oil_bbl_per_day)

df.oil %>% glimpse()



#========
# GET MAP
#========

map.world <- map_data('world')

df.oil


#==========================
# CHECK FOR JOIN MISMATCHES
#==========================

anti_join(df.oil, map.world, by = c('country' = 'region'))


# rank                           country opec_ind oil_bbl_per_day
# 1   67 Congo, Democratic Republic of the        0          20,000
# 2   47               Trinidad and Tobago        0          60,090
# 3   34            Sudan and  South Sudan        0         255,000
# 4   30            Congo, Republic of the        0         308,363
# 5   20                    United Kingdom        0         939,760
# 6    3                     United States        0       8,875,817



#=====================
# RECODE COUNTRY NAMES
#=====================

map.world %>%
  group_by(region) %>%
  summarise() %>%
  print(n = Inf)


# UK
# USA
# Democratic Republic of the Congo
# Trinidad
# Sudan
# South Sudan


df.oil <- df.oil %>%  mutate(country = recode(country, `United States` = 'USA'
                                 , `United Kingdom` = 'UK'
                                 , `Congo, Democratic Republic of the` = 'Democratic Republic of the Congo'
                                 , `Trinidad and Tobago` = 'Trinidad'
                                 , `Sudan and South Sudan` = 'Sudan'
                                 #, `Sudan and  South Sudan` = 'South Sudan'
                                 , `Congo, Republic of the` = 'Republic of Congo'
                                 )
                   )



#-----------------------
# JOIN DATASETS TOGETHER
#-----------------------

map.oil <- left_join( map.world, df.oil, by = c('region' = 'country')) 



#=====
# PLOT
#=====

# BASIC (this is a first draft)

ggplot(map.oil, aes( x = long, y = lat, group = group )) +
  geom_polygon(aes(fill = oil_bbl_per_day))



#=======================
# FINAL, FORMATTED DRAFT
#=======================


df.oil %>% filter(oil_bbl_per_day > 822675) %>% summarise(mean(oil_bbl_per_day))
# 3190373

df.oil %>% filter(oil_bbl_per_day < 822675) %>% summarise(mean(oil_bbl_per_day))
# 96581.08



ggplot(map.oil, aes( x = long, y = lat, group = group )) +
  geom_polygon(aes(fill = oil_bbl_per_day)) +
  scale_fill_gradientn(colours = c('#461863','#404E88','#2A8A8C','#7FD157','#F9E53F')
                       ,values = scales::rescale(c(100,96581,822675,3190373,10000000))
                       ,labels = comma
                       ,breaks = c(100,96581,822675,3190373,10000000)
                       ) +
  guides(fill = guide_legend(reverse = T)) +
  labs(fill = 'bbl/day'
       ,title = 'Oil Production by Country'
       ,subtitle = 'Barrels per day, 2016'
       ,x = NULL
       ,y = NULL) +
  theme(text = element_text(family = 'Gill Sans', color = '#EEEEEE')
        ,plot.title = element_text(size = 28)
        ,plot.subtitle = element_text(size = 14)
        ,axis.ticks = element_blank()
        ,axis.text = element_blank()
        ,panel.grid = element_blank()
        ,panel.background = element_rect(fill = '#333333')
        ,plot.background = element_rect(fill = '#333333')
        ,legend.position = c(.18,.36)
        ,legend.background = element_blank()
        ,legend.key = element_blank()
        ) +
  annotate(geom = 'text'
           ,label = 'Source: U.S. Energy Information Administration\nhttps://en.wikipedia.org/wiki/List_of_countries_by_oil_production'
           ,x = 18, y = -55
           ,size = 3
           ,family = 'Gill Sans'
           ,color = '#CCCCCC'
           ,hjust = 'left'
           )
And here’s the finalized map that the code produces: