This is the organization and presentation of the UN_2011_Population_Cities_Over_750K.xlsx data set.
I started my process by making transforming the data so that it could be more easily presented as a whole.
setwd("/Users/JonathanCampbell/Desktop/Columbia/Data Viz/QMSSViz/Lab1")
library("XLConnect")
## Loading required package: XLConnectJars
## XLConnect 0.2-9 by Mirai Solutions GmbH [aut],
## Martin Studer [cre],
## The Apache Software Foundation [ctb, cph] (Apache POI, Apache Commons
## Codec, XML Commons External Components XML APIs),
## Stephen Colebourne [ctb, cph] (Joda-Time Java library),
## Metastuff, Ltd. [ctb, cph] (dom4j)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
wb <- loadWorkbook("UN_2011_Population_Cities_Over_750K.xlsx", create = FALSE)
# Show a summary of the workbook (shows worksheets,
# defined names, hidden sheets, active sheet name, ...)
summary(wb)
## *** XLConnect Workbook Summary ***
## > Filename: 'UN_2011_Population_Cities_Over_750K.xlsx'
## > Sheets (all):
## CITIES-OVER-750K, NOTES, 5yr, Decadal
## > Hidden Sheets:
## <NONE>
## > Very Hidden Sheets:
## <NONE>
## > Names:
## _xlnm._FilterDatabase, _xlnm.Print_Area, table
## > Active Sheet: CITIES-OVER-750K
# Read data from a worksheet interpreting the first row as column names
df1 <- readWorksheet(wb, sheet = "CITIES-OVER-750K", startRow=13, header=TRUE)
# Read data from a named region/range interpreting the first row as column
# names
## df2 <- readNamedRegion(wb, name = "myname", header = TRUE)
colnames(df1)<- c("Country.Code", "Country", "City.Code", "Urban.Agglomeration",
"Note", "Latitude", "Longitude", "pop50", "pop55", "pop60",
"pop65", "pop70", "pop75", "pop80", "pop85", "pop90", "pop95",
"pop00", "pop05", "pop10", "pop15", "pop20", "pop25")
#Rename columns to reflect the dates of the population survey
country <- unique(df1$Country, incomparables=FALSE)
# create vector of unique countries within sheet
countries <- data.frame(table(df1$Country))
# frequecy table of unique countries as data frame for plot
#need to add average population to dataframe for graphing
df2 <- data.frame(df1$Country, df1$pop10, df1$pop15, df1$pop20)
library("plyr")
# attempt to rename columns for data.frame
sum10<-aggregate(.~df1.Country, data=df2, sum)
# create data.frame of Countries by totaling population across large cities
mean10<-aggregate(.~df1.Country, data=df2, mean)
# create data.frame of Countries by mean population across large cities
max10<-aggregate(.~df1.Country, data=df2, FUN=max)
# create data.frame of Countries by largest city population
dfm <- merge(countries, mean10, by.x="Var1", by.y="df1.Country")
dfm1<- merge(dfm, sum10, by.x="Var1", by.y="df1.Country")
dfm2<- merge(dfm1, max10, by.x="Var1", by.y="df1.Country")
#merging tables into one
colnames(dfm2) <- c("Country", "Freq", "mean10", "mean15", "mean20",
"sum10", "sum15", "sum20", "max10", "max15", "max20")
# Renaming columns in merged data.frame
meansort <- subset(dfm2, select = c(Country, Freq, mean10, mean15, mean20))[order(-dfm2$mean10),]
sumsort <- subset(dfm2, select = c(Country, sum10, sum15, sum20))[order(-dfm2$sum10),]
maxsort <- subset(dfm2, select = c(Country, max10, max15, max20))[order(-dfm2$max10),]
#create sorted dataframs in order to parse down via intersection
dfintersect <- merge(meansort[1:25,], sumsort[1:25,], by.x="Country", by.y="Country")
#intersection of top 25 of both categories - 14 hits
dfintersect2 <- merge(dfintersect, maxsort[1:25,], by.x="Country", by.y="Country")
#intersection of top 25 in all 3 categories - 13 hits
maxsort1 <- maxsort[1:20,]
#Top 20 list from maximum city size for countries
library(ggplot2)
#You can also embed plots, for example:
ggplot(dfintersect2, aes(x=Freq, y=mean10,label=Country)) +
xlab("Number of Cities") +
ylab("Average Population") +
ggtitle("City Population Distribution")+
geom_text(aes(label=Country, size=1.8),
position=position_jitter(width=-100, height=-200)) +
geom_abline(intercept=1500, slope=110, colour="green", size=1, alpha = I(0.3)) +
ylim(1500, 7950) + xlim(0, 60) +
theme_bw() + theme(legend.position="none")
##Avg Population vs Frequency of Cities
# Shows the distribution of select countires and how they're average large city population weights against the frequency of large cities in that country
ggplot(dfintersect2, aes(x=Freq, y=mean10,label=Country)) +
xlab("Number of Cities") +
ylab("Mean Population") +
ggtitle("City Population Disparity") +
geom_point(aes(size = max10, colour = Country)) +
scale_size_area()
##Scatterplot of City Population disparities
#Similar to the plot above, only this plot weighs the size of the largest city in that country to the dot size.
ggplot(maxsort1, aes(x=Country, y=max10)) +
ggtitle("Countries with the Largest Cities") +
ylab("City Population") +
geom_bar(stat="identity", aes(fill=Country)) + coord_flip() +
theme_bw() + theme(legend.position="none")
##Bar chart of largest city by country
#Shows the City populations of the largest cities in select countries.
ggplot(df1, aes(x=Longitude, y=Latitude,label=Urban.Agglomeration)) +
geom_point(shape=1, size=0.3) +
xlab("Longitude") +
ylab("Latitude") +
ggtitle("World Population groupings")+
geom_point(aes(size = pop10, colour = pop10))+
scale_size_area()+
scale_colour_gradient(low = "red", high = "yellow")
##World population distribution of large cities
#Uses latitude and Longitude coordinates to plot large cities across the world on the geographic grid. The graph uses scaled dot size and color gradiation in order to represent the population sizes of the cities across the world