China Vs US Vs Middle East - Who smokes more?
Nothing helps like practice. I'm fiddling around with machine learning and data visualization libraries to become proficient in them. So I wanted to try my hand at coaxing information from data. That's when I came across the CO2 emission index of countries in Quandl, one of the excellent sources of open data. So I thought: how would it be to take other attributes of the countries and find a relation between those attributes and the CO2 emission index?
Below is the code for scraping the data from Quandl. We are scraping the data because the data we are interested in is from the summary page and not an individual data source per se. At this point, I'm obligated to mention that Quandl provides the ability to download data as CSV for any timeframe range and your choice of frequency. The same can also be obtained through their excellent Python library.
import sys
import os
import csv

import requests
from lxml import html


def main(url, outfile):
    """Scrape a Quandl summary-page data table and write it to ``outfile`` as CSV.

    url     -- summary page whose last ``datatable`` div holds the table we want.
    outfile -- destination CSV path; parent directories are created on demand.

    Exits with status 1 when the page cannot be fetched.
    """
    dirname = os.path.dirname(outfile)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    source = requests.get(url)
    if source.status_code != 200:
        print("Got status code", source.status_code, "Exiting")
        sys.exit(1)

    doc = html.fromstring(source.text)
    records = [('country', 'value', 'unit', 'year')]
    # The last datatable contains the World data.
    table = doc.xpath('//div[@class="datatable"]')[-1]
    rows = table.xpath('.//tr')
    print(len(rows))
    for row in rows:
        try:
            tds = row.xpath('td')
            if not tds:  # header rows have <th> cells, nothing to scrape
                continue
            # The country name lives in either an <a> or a <span> element.
            country = str((tds[0].xpath('a/@data-sort-value')
                           + tds[0].xpath('span/@data-sort-value'))[0])
            value = str(tds[1].xpath('span/@data-sort-value')[0])
            if value:
                value = float(value)
            unit = tds[2].text
            year = str(tds[3].xpath('span/@data-sort-value')[0])
            if year:
                year = int(year)
        except Exception:
            # Log the offending row but keep scraping the remaining ones.
            import traceback
            print(traceback.format_exception(*sys.exc_info()))
            print(html.tostring(row))
        else:
            records.append((country, value, unit, year))
    # Use a context manager so the CSV file is flushed and closed.
    with open(outfile, 'w') as fp:
        csv.writer(fp).writerows(records)


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])
The above code can be used to download tabular data for 'All Countries' from quandl. The data we are interested in are
* GDP - All Countries
* Fuelwood Production - All Countries
* Urban Population - All Countries
* Gini Index - All Countries
* Passenger Vehicle - All Countries
* Cellphone Subscription - All Countries
* GDP -PPP - All Countries
* Population - All Countries
And there is the main data that we are going to work on : "CO2 Emissions - All Countries"
We are going to find which of the above data best predicts the CO2 emission a country would have.
import matplotlib as plot
import numpy as np
import pandas as pd
from mpltools import style

# Use the ggplot look for every plot in this notebook.
style.use('ggplot')

from pprint import pprint
from glob import glob

# Collect every scraped dataset sitting in the working directory.
csvs = glob('*.csv')
pprint(csvs)
['cellphone.csv',
'urbanpop.csv',
'population.csv',
'gdp.csv',
'gdp_ppp.csv',
'fuelwood.csv',
'co2_emissions.csv',
'vehicles.csv',
'gini.csv']
First let's read in the CO2 emissions for each country and calculate CO2 emission per 1000 people. Then let us bring the rest of the params to the same denomination of 1000 people where possible.
co2_em = pd.read_csv('co2_emissions.csv', index_col=0) co2_em[:5] population = pd.read_csv('population.csv', index_col=0) population[:5] data = pd.DataFrame(co2_em['value']) data['pop'] = population['value'] * 1000 #Population is in millions. We are converting it to 1000s data['co2_1000'] = data['value']/data['pop'] data[:5]
value | pop | co2_1000 | |
---|---|---|---|
country | |||
afghanistan | 6314.574 | 32017 | 0.197226 |
albania | 3006.940 | 3243 | 0.927209 |
algeria | 121311.694 | 36494 | 3.324154 |
angola | 26655.423 | 20213 | 1.318727 |
antigua and barbuda | 462.042 | 88 | 5.250477 |
cellphone_usage = pd.read_csv('cellphone.csv', index_col=0) #cellphone connections per 100 persons data['cellphone'] = cellphone_usage['value'] * 10 #converting it to per 1000 persons urbanpop = pd.read_csv('urbanpop.csv', index_col=0) #urban pop in percentage data['urbanpop'] = urbanpop['value'] gdp = pd.read_csv('gdp.csv', index_col=0) #gdp in billions data['gdp'] = gdp['value'] * 1000000000/(data['pop']*1000) #gdp per person fuelwood = pd.read_csv('fuelwood.csv', index_col=0)#kiloTonnes data['fuelwood'] = fuelwood['value']* 1000/data['pop'] #fuelwood in tonnes per person gini = pd.read_csv('gini.csv', index_col=0) data['gini'] = gini['value'] vehicles = pd.read_csv('vehicles.csv', index_col=0) # vehicles per thousand people data['vehicles'] = vehicles['value'] gdp_ppp = pd.read_csv('gdp_ppp.csv', index_col=0) data['gdp_ppp'] = gdp_ppp['value'] data[:10]
value | pop | co2_1000 | cellphone | urbanpop | gdp | fuelwood | gini | vehicles | gdp_ppp | |
---|---|---|---|---|---|---|---|---|---|---|
country | ||||||||||
afghanistan | 6314.574 | 32017 | 0.197226 | 538.969630 | 23.8554 | 621.732205 | 49.896899 | 27.82 | 20.112597 | 1053.813 |
albania | 3006.940 | 3243 | 0.927209 | 1084.473347 | 54.4472 | 3912.426765 | 301.031517 | 34.51 | 91.979675 | 8052.178 |
algeria | 121311.694 | 36494 | 3.324154 | 1033.058644 | 73.7060 | 5693.922289 | 7.184152 | 35.33 | 75.872877 | 7477.069 |
angola | 26655.423 | 20213 | 1.318727 | 486.100024 | 59.9062 | 5873.398308 | 982.806239 | 42.66 | 8.000000 | 6346.742 |
antigua and barbuda | 462.042 | 88 | 5.250477 | 1986.178323 | 29.8672 | 13363.636364 | NaN | NaN | 153.276256 | 18026.662 |
argentina | 174717.882 | 41028 | 4.258504 | 1425.117584 | 92.6406 | 11576.338111 | 113.385980 | 44.49 | NaN | 18112.330 |
armenia | 4492.075 | 3366 | 1.334544 | 1068.789619 | 64.1636 | 2990.493167 | 11.883541 | 31.30 | 94.000000 | 5838.258 |
australia | 400194.378 | 22766 | 17.578599 | 1061.928152 | 89.3372 | 67723.666872 | 471.270955 | 35.19 | 556.185481 | 42640.275 |
austria | 62313.331 | 8466 | 7.360422 | 1612.069881 | 67.8808 | 47081.738720 | 733.052303 | 29.15 | 529.338286 | 42408.575 |
azerbaijan | 49075.461 | 9235 | 5.314073 | 1074.721324 | 53.8886 | 7450.351922 | 32.658365 | 33.71 | 83.849698 | 10478.231 |
Panda's Dataframe provides a convenient function 'corr' that provides the
correlation between each and every column of the dataframe.
data.corr()
value | pop | co2_1000 | cellphone | urbanpop | gdp | fuelwood | gini | vehicles | gdp_ppp | |
---|---|---|---|---|---|---|---|---|---|---|
value | 1.000000 | 0.795840 | 0.156284 | 0.017960 | 0.100557 | 0.101115 | -0.096348 | -0.032174 | 0.070360 | 0.116638 |
pop | 0.795840 | 1.000000 | -0.013529 | -0.078453 | -0.039648 | -0.046795 | -0.061197 | -0.043269 | -0.093295 | -0.051414 |
co2_1000 | 0.156284 | -0.013529 | 1.000000 | 0.459959 | 0.483094 | 0.699502 | -0.098430 | -0.228570 | 0.639473 | 0.764942 |
cellphone | 0.017960 | -0.078453 | 0.459959 | 1.000000 | 0.537769 | 0.401796 | -0.071408 | -0.092684 | 0.432711 | 0.505035 |
urbanpop | 0.100557 | -0.039648 | 0.483094 | 0.537769 | 1.000000 | 0.607351 | -0.123386 | -0.110808 | 0.625057 | 0.670027 |
gdp | 0.101115 | -0.046795 | 0.699502 | 0.401796 | 0.607351 | 1.000000 | -0.043769 | -0.314087 | 0.761361 | 0.945029 |
fuelwood | -0.096348 | -0.061197 | -0.098430 | -0.071408 | -0.123386 | -0.043769 | 1.000000 | 0.120487 | -0.169399 | -0.054676 |
gini | -0.032174 | -0.043269 | -0.228570 | -0.092684 | -0.110808 | -0.314087 | 0.120487 | 1.000000 | -0.423783 | -0.275202 |
vehicles | 0.070360 | -0.093295 | 0.639473 | 0.432711 | 0.625057 | 0.761361 | -0.169399 | -0.423783 | 1.000000 | 0.776197 |
gdp_ppp | 0.116638 | -0.051414 | 0.764942 | 0.505035 | 0.670027 | 0.945029 | -0.054676 | -0.275202 | 0.776197 | 1.000000 |
As we can see from the correlation matrix, plain CO2 emission levels are heavily correlated with population and almost unrelated to the rest. When we discount the population factor by normalizing to get CO2 emission per 1000 people, we find that it is somewhat correlated with cellphone connections per 1000 people and urban population percentage. It is well correlated with gdp, vehicles per 1000 and gdp_ppp. Now let's try plotting it.
import matplotlib.pyplot as plt

# Scatter each candidate predictor against CO2 emissions per 1000 people.
fig = plt.figure(tight_layout=True)
fields = ['cellphone', 'urbanpop', 'gdp', 'fuelwood', 'gini', 'vehicles', 'gdp_ppp']
fig.set_size_inches(12, 12)
for index, field in enumerate(fields):
    # matplotlib renders mathtext, not HTML, so use $_2$ for the subscript.
    ax = fig.add_subplot(3, 3, index + 1, xlabel=field,
                         ylabel='CO$_2$ emissions per 1000 people')
    ax.scatter(data[field], data['co2_1000'])
fig.show()
[caption id="attachment_86" align="aligncenter" width="856"] Factors that correlate the most with CO2 emmisions.[/caption]
Now let us remove the records which have NaN as the value for CO2 emissions.
clean_data = data[np.isfinite(data['co2_1000'])] y = clean_data['co2_1000'].values x = clean_data[['cellphone', 'urbanpop', 'gdp', 'fuelwood', 'gini', 'vehicles', 'gdp_ppp']].values.transpose() x.shape
(7, 178)
Now lets see which countries have the highest carbon emission per 1000 people.
plt.cool() sorted_data = clean_data.sort(columns=['co2_1000'], ascending=False) top50_data = sorted_data[:40] labels = top50_data.index co2_1000 = top50_data['co2_1000'].values figure(figsize=(10,10)) colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral'] colors = ['seashell','pink','plum', 'PapayaWhip','violet', 'PeachPuff','MediumOrchid'] plt.pie(co2_1000, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90) # Set aspect ratio to be equal so that pie is drawn as a circle. plt.axis('equal') plt.show()
It is interesting to see that the countries that contribute the highest carbon footprint are from the Middle East, followed by the developed countries. So which countries contribute the least? As expected, the countries having the smallest carbon footprint are mostly the least developed.
', '.join([x.title() for x in sorted_data[-30:].index])
'Kenya, Laos, Ivory Coast, Togo, Sierra Leone, Haiti, Afghanistan, Guinea-Bissau, Comoros, Myanmar, Timor-Leste, Tanzania, Zambia, Liberia, Mozambique, Nepal, Guinea, Uganda, Burkina Faso, Ethiopia, Eritrea, Madagascar, Niger, Rwanda, Malawi, Central African Republic, Chad, Mali, Congo, Burundi'
Lets make a map of countries and their carbon footprint. We need to map the countries to their 3-letter abbreviations.
Let us look at the distribution of emissions among the countries. As is apparent, there is asymmetry in how the carbon emissions are distributed.
plt.hist(sorted_data['co2_1000'], color=['green'], bins=15)
(array([91, 23, 29, 18, 4, 4, 2, 1, 2, 1, 0, 1, 0, 0, 2]),
array([ 2.17303704e-02, 2.57036546e+00, 5.11900054e+00,
7.66763563e+00, 1.02162707e+01, 1.27649058e+01,
1.53135409e+01, 1.78621760e+01, 2.04108111e+01,
2.29594461e+01, 2.55080812e+01, 2.80567163e+01,
3.06053514e+01, 3.31539865e+01, 3.57026216e+01,
3.82512567e+01]),
<a>)</a>
[caption id="attachment_88" align="aligncenter" width="380"] Biggest Absolute contributors of CO2 emissions[/caption]
This begs the question, who are the biggest absolute contributors?
carbon_absolute = clean_data.sort(columns=['value'], ascending=False) total_emission = sum(carbon_absolute['value']) print 'Total Global Emission is:', total_emission carbon_absolute['percentage'] = 100 * carbon_absolute['value']/total_emission print carbon_absolute['percentage'][:10]
Total Global Emission is: 29948374.332
country
china 25.667883
usa 17.695662
india 6.609456
russia 5.257000
japan 3.676774
germany 2.452885
iran 2.010311
canada 1.716078
south korea 1.700846
south africa 1.666255
Name: percentage, dtype: float64
plt.hist(carbon_absolute['percentage'], color=['green'], bins=15)
(array([170, 3, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1]),
array([ 1.71421659e-04, 1.71135221e+00, 3.42253300e+00,
5.13371379e+00, 6.84489458e+00, 8.55607537e+00,
1.02672562e+01, 1.19784370e+01, 1.36896177e+01,
1.54007985e+01, 1.71119793e+01, 1.88231601e+01,
2.05343409e+01, 2.22455217e+01, 2.39567025e+01,
2.56678833e+01]),
<a>)</a>
It is interesting to see that even though the Middle East has the largest per capita contribution, it is insignificant compared to the giants — China, the USA and India — because of the sheer population size of the latter. China, the USA and India together contribute nearly 50% of the world's carbon emissions. Let's generate a map of the countries of the world by net emissions and by per-1000-people footprint. The code used for generating the required JavaScript is below the map.
Total Emission Map
Emission per 1000 People Map
import json
import csv
from collections import defaultdict


def generate_total_emission_map():
    """Build datamaps-style per-country fill data for absolute emissions.

    Returns (map_data, fills): map_data keys countries by their 3-letter
    abbreviation and carries rank, fill bucket, name, population and every
    column of `carbon_absolute`; fills maps bucket names to hex colors.
    """

    def get_fill(carb_em, index):
        # Bucket a country by its percentage share of world emissions.
        if carb_em >= 10:
            return 'VERYHIGH'
        elif carb_em >= 2:
            return 'HIGH'
        elif carb_em >= 1:
            return 'MEDIUM'
        else:
            return 'LOW'

    fills = {'VERYHIGH': '#FF003C', 'HIGH': '#FF8A00', 'MEDIUM': '#FABE28',
             'LOW': '#88C100', 'VERYLOW': '#00C176', 'defaultFill': '#87796F'}
    # Country name -> 3-letter abbreviation, scraped earlier into abbr.csv.
    with open('abbr.csv', 'r') as fp:
        matched_abbr = dict(csv.reader(fp))
    map_data = defaultdict(lambda: defaultdict(str))
    columns = carbon_absolute.columns

    for index, item in enumerate(carbon_absolute.index):
        abbr = matched_abbr[item]
        map_data[abbr]['rank'] = index + 1
        map_data[abbr]['fillKey'] = get_fill(carbon_absolute.ix[item]['percentage'], item)
        map_data[abbr]['cname'] = item
        map_data[abbr]['population'] = carbon_absolute.ix[item]['pop'] / 1000
        for col in columns:
            map_data[abbr][col] = '%2.2f' % carbon_absolute.ix[item][col]

    return map_data, fills


map_data, fills = generate_total_emission_map()
# Splice the fill colors and data into the datamaps JS template.
jscode = (open('javascript_template.js').read()
          .replace('#FILLS#', json.dumps(fills))
          .replace('#DATA#', json.dumps(map_data))
          .replace('#MAPNAME#', 'total_emission'))
open('total_emission.js', 'w').write(jscode)
import json
import csv
from collections import defaultdict


def generate_co2_1000_map():
    """Build datamaps-style per-country fill data for per-1000-people emissions.

    Same shape as generate_total_emission_map(), but countries are bucketed
    by their co2_1000 value instead of their percentage share.
    """

    def get_fill(carb_em, index):
        # Bucket a country by its CO2 emissions per 1000 people.
        if carb_em >= 20:
            print('veryhigh', index)
            return 'VERYHIGH'
        elif carb_em >= 10:
            return 'HIGH'
        elif carb_em >= 5:
            return 'MEDIUM'
        elif carb_em >= 2:
            return 'LOW'
        else:
            return 'VERYLOW'

    fills = {'VERYHIGH': '#FF003C', 'HIGH': '#FF8A00', 'MEDIUM': '#FABE28',
             'LOW': '#88C100', 'VERYLOW': '#00C176', 'defaultFill': '#87796F'}
    # Country name -> 3-letter abbreviation, scraped earlier into abbr.csv.
    with open('abbr.csv', 'r') as fp:
        matched_abbr = dict(csv.reader(fp))
    map_data = defaultdict(lambda: defaultdict(str))
    columns = carbon_absolute.columns

    for index, item in enumerate(carbon_absolute.index):
        abbr = matched_abbr[item]
        map_data[abbr]['rank'] = index + 1
        map_data[abbr]['fillKey'] = get_fill(carbon_absolute.ix[item]['co2_1000'], item)
        map_data[abbr]['cname'] = item
        map_data[abbr]['population'] = carbon_absolute.ix[item]['pop'] / 1000
        for col in columns:
            map_data[abbr][col] = '%2.2f' % carbon_absolute.ix[item][col]

    return map_data, fills


map_data, fills = generate_co2_1000_map()
# Splice the fill colors and data into the datamaps JS template.
jscode = (open('javascript_template.js').read()
          .replace('#FILLS#', json.dumps(fills))
          .replace('#DATA#', json.dumps(map_data))
          .replace('#MAPNAME#', 'co2_1000'))
open('co2_1000.js', 'w').write(jscode)