Hw1

This is the organization and presentation of the UN_2011_Population_Cities_Over_750K.xlsx data set.

I started my process by making transforming the data so that it could be more easily presented as a whole.

setwd("/Users/JonathanCampbell/Desktop/Columbia/Data Viz/QMSSViz/Lab1")
library("XLConnect")

## Loading required package: XLConnectJars
## XLConnect 0.2-9 by Mirai Solutions GmbH [aut],
##   Martin Studer [cre],
##   The Apache Software Foundation [ctb, cph] (Apache POI, Apache Commons
##     Codec, XML Commons External Components XML APIs),
##   Stephen Colebourne [ctb, cph] (Joda-Time Java library),
##   Metastuff, Ltd. [ctb, cph] (dom4j)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com

wb <- loadWorkbook("UN_2011_Population_Cities_Over_750K.xlsx", create = FALSE)
# Show a summary of the workbook (shows worksheets,
# defined names, hidden sheets, active sheet name, ...)
summary(wb)

## *** XLConnect Workbook Summary ***
## > Filename: 'UN_2011_Population_Cities_Over_750K.xlsx'
## > Sheets (all):
## CITIES-OVER-750K, NOTES, 5yr, Decadal
## > Hidden Sheets:
## <NONE>
## > Very Hidden Sheets:
## <NONE>
## > Names:
## _xlnm._FilterDatabase, _xlnm.Print_Area, table
## > Active Sheet:  CITIES-OVER-750K

# Read data from a worksheet interpreting the first row as column names
    df1 <- readWorksheet(wb, sheet = "CITIES-OVER-750K", startRow=13, header=TRUE)
    # Read data from a named region/range interpreting the first row as column
    # names
    ## df2 <- readNamedRegion(wb, name = "myname", header = TRUE)
colnames(df1)<- c("Country.Code", "Country", "City.Code", "Urban.Agglomeration", 
                  "Note", "Latitude", "Longitude", "pop50", "pop55", "pop60", 
                  "pop65", "pop70", "pop75", "pop80", "pop85", "pop90", "pop95", 
                  "pop00", "pop05", "pop10", "pop15", "pop20", "pop25")
#Rename columns to reflect the dates of the population survey

country <- unique(df1$Country, incomparables=FALSE)
# create vector of unique countries within sheet
countries <- data.frame(table(df1$Country))
# frequecy table of unique countries as data frame for plot 
  #need to add average population to dataframe for graphing
df2 <- data.frame(df1$Country, df1$pop10, df1$pop15, df1$pop20)
library("plyr")
# attempt to rename columns for data.frame 
sum10<-aggregate(.~df1.Country, data=df2, sum)
# create data.frame of Countries by totaling population across large cities
mean10<-aggregate(.~df1.Country, data=df2, mean)
# create data.frame of Countries by mean population across large cities
max10<-aggregate(.~df1.Country, data=df2, FUN=max)
# create data.frame of Countries by largest city population
dfm <- merge(countries, mean10, by.x="Var1", by.y="df1.Country")
dfm1<- merge(dfm, sum10, by.x="Var1", by.y="df1.Country")
dfm2<- merge(dfm1, max10, by.x="Var1", by.y="df1.Country")
#merging tables into one
colnames(dfm2) <- c("Country", "Freq", "mean10", "mean15", "mean20", 
                    "sum10", "sum15", "sum20", "max10", "max15", "max20")
# Renaming columns in merged data.frame

meansort <- subset(dfm2, select = c(Country, Freq, mean10, mean15, mean20))[order(-dfm2$mean10),]
sumsort <- subset(dfm2, select = c(Country, sum10, sum15, sum20))[order(-dfm2$sum10),]
maxsort <- subset(dfm2, select = c(Country, max10, max15, max20))[order(-dfm2$max10),]
#create sorted dataframs in order to parse down via intersection

dfintersect <- merge(meansort[1:25,], sumsort[1:25,], by.x="Country", by.y="Country")
#intersection of top 25 of both categories - 14 hits
dfintersect2 <- merge(dfintersect, maxsort[1:25,], by.x="Country", by.y="Country")
#intersection of top 25 in all 3 categories - 13 hits

maxsort1 <- maxsort[1:20,]
#Top 20 list from maximum city size for countries

library(ggplot2)

#You can also embed plots, for example:

ggplot(dfintersect2, aes(x=Freq, y=mean10,label=Country)) +
  xlab("Number of Cities") + 
  ylab("Average Population") + 
  ggtitle("City Population Distribution")+
  geom_text(aes(label=Country, size=1.8),
            position=position_jitter(width=-100, height=-200)) +
  geom_abline(intercept=1500, slope=110, colour="green", size=1, alpha = I(0.3)) +
  ylim(1500, 7950) + xlim(0, 60) + 
  theme_bw() + theme(legend.position="none")

plot of chunk unnamed-chunk-2

##Avg Population vs Frequency of Cities
# Shows the distribution of select countires and how they're average large city population weights against the frequency of large cities in that country

ggplot(dfintersect2, aes(x=Freq, y=mean10,label=Country)) +
  xlab("Number of Cities") + 
  ylab("Mean Population") + 
  ggtitle("City Population Disparity") +
  geom_point(aes(size = max10, colour = Country)) +
  scale_size_area()

plot of chunk unnamed-chunk-3

##Scatterplot of City Population disparities
#Similar to the plot above, only this plot weighs the size of the largest city in that country to the dot size.

ggplot(maxsort1, aes(x=Country, y=max10)) +
  ggtitle("Countries with the Largest Cities") +
  ylab("City Population") +
  geom_bar(stat="identity", aes(fill=Country)) + coord_flip() +
  theme_bw() + theme(legend.position="none")

plot of chunk unnamed-chunk-4

##Bar chart of largest city by country
#Shows the City populations of the largest cities in select countries.

ggplot(df1, aes(x=Longitude, y=Latitude,label=Urban.Agglomeration)) +
  geom_point(shape=1, size=0.3) +
  xlab("Longitude") + 
  ylab("Latitude") + 
  ggtitle("World Population groupings")+
  geom_point(aes(size = pop10, colour = pop10))+
  scale_size_area()+
  scale_colour_gradient(low = "red", high = "yellow")

plot of chunk unnamed-chunk-5

##World population distribution of large cities
#Uses latitude and Longitude coordinates to plot large cities across the world on the geographic grid.  The graph uses scaled dot size and color gradiation in order to represent the population sizes of the cities across the world