China Vs US Vs Middle East - Who smokes more?

2013-12-18 » Julia, Machine Learning, Python

Nothing helps like practice. I’m fiddling around with machine learning and data visualization libraries to become proficient in them. So I wanted to try my hand in coaxing information from data. Thats when I came across CO2 emission index of countries in Quandl, one of the excellent sources of open data. So I thought how would it be to take other attributes of the countries and find a relation between those attributes and CO2 emission index.

Below is the code for scraping the data from Quandl. We are scraping the data because the data we are interested is from the summary page and not individual data source per se. At this point, I’m obligated to mention that quandl provides ability to download data as csv for any timeframe range and your choice of frequency. The same can also be obtained through their excellent python library.

import sys
import os
from lxml import html
import requests
import csv

def main(url, outfile):
    dirname = os.path.dirname(outfile)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    source = requests.get(url)
    if not source.status_code == 200:
        print "Got status code", source.status_code, "Exiting"
        sys.exit(1)

    doc = html.fromstring(source.text)
    records = [('country', 'value', 'unit', 'year')]
    table = doc.xpath('//div[@class="datatable"]')[-1] #The last table contains World data
    rows = table.xpath(".//tr")
    print len(rows)
    for row in rows:
        try:
            tds = row.xpath('td')
            if not tds: continue
            country = str((tds[0].xpath('a/@data-sort-value') + 
                            tds[0].xpath('span/@data-sort-value')
                            )[0])
            value = str(tds[1].xpath('span/@data-sort-value')[0])
            if value:
                value = float(value)
            unit = tds[2].text
            year = str(tds[3].xpath('span/@data-sort-value')[0])
            if year:
                year = int(year)
        except:
            import traceback
            print traceback.format_exception(*sys.exc_info())
            print html.tostring(row)
        else:
            records.append((country, value, unit, year))
    csv.writer(open(outfile, 'w')).writerows(records)

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])

The above code can be used to download tabular data for ‘All Countries’ from quandl. The data we are interested in are * GDP - All Countries * Fuelwood Production - All Countries * Urban Population - All Countries * Gini Index - All Countries * Passenger Vehicle - All Countries * Cellphone Subscription - All Countries * GDP -PPP - All Countries * Population - All Countries

And there is the main data that we are going to work on : “CO2 Emissions - All Countries”

We are going to find which of the above data best predicts the CO2 emission a country would have.

import matplotlib as plot
import numpy as np
import pandas as pd
from mpltools import style

style.use('ggplot')



from pprint import pprint
from glob import glob
csvs = glob('*.csv')
pprint(csvs)

['cellphone.csv',
 'urbanpop.csv',
 'population.csv',
 'gdp.csv',
 'gdp_ppp.csv',
 'fuelwood.csv',
 'co2_emissions.csv',
 'vehicles.csv',
 'gini.csv']

First lets read in the CO2 emissions for each country and calculate CO2 emission per 1000 people. Then let us bring the rest of the params to the same denomination of 1000 people where possible.

co2_em = pd.read_csv('co2_emissions.csv', index_col=0)
co2_em[:5]
population =  pd.read_csv('population.csv', index_col=0)
population[:5]
data = pd.DataFrame(co2_em['value'])
data['pop'] = population['value'] * 1000 #Population is in millions. We are converting it to 1000s
data['co2_1000'] = data['value']/data['pop']
data[:5]
value pop co2_1000
country
afghanistan 6314.574 32017 0.197226
albania 3006.940 3243 0.927209
algeria 121311.694 36494 3.324154
angola 26655.423 20213 1.318727
antigua and barbuda 462.042 88 5.250477
cellphone_usage = pd.read_csv('cellphone.csv', index_col=0) #cellphone connections per 100 persons
data['cellphone'] = cellphone_usage['value'] * 10 #converting it to per 1000 persons
urbanpop = pd.read_csv('urbanpop.csv', index_col=0) #urban pop in percentage
data['urbanpop'] = urbanpop['value']
gdp = pd.read_csv('gdp.csv', index_col=0) #gdp in billions 
data['gdp'] = gdp['value'] * 1000000000/(data['pop']*1000) #gdp per person
fuelwood = pd.read_csv('fuelwood.csv', index_col=0)#kiloTonnes
data['fuelwood'] = fuelwood['value']* 1000/data['pop'] #fuelwood in tonnes per person
gini = pd.read_csv('gini.csv', index_col=0)
data['gini'] = gini['value']
vehicles = pd.read_csv('vehicles.csv', index_col=0) # vehicles per thousand people
data['vehicles'] = vehicles['value']
gdp_ppp = pd.read_csv('gdp_ppp.csv', index_col=0)
data['gdp_ppp'] = gdp_ppp['value']
data[:10]
value pop co2_1000 cellphone urbanpop gdp fuelwood gini vehicles gdp_ppp
country
afghanistan 6314.574 32017 0.197226 538.969630 23.8554 621.732205 49.896899 27.82 20.112597 1053.813
albania 3006.940 3243 0.927209 1084.473347 54.4472 3912.426765 301.031517 34.51 91.979675 8052.178
algeria 121311.694 36494 3.324154 1033.058644 73.7060 5693.922289 7.184152 35.33 75.872877 7477.069
angola 26655.423 20213 1.318727 486.100024 59.9062 5873.398308 982.806239 42.66 8.000000 6346.742
antigua and barbuda 462.042 88 5.250477 1986.178323 29.8672 13363.636364 NaN NaN 153.276256 18026.662
argentina 174717.882 41028 4.258504 1425.117584 92.6406 11576.338111 113.385980 44.49 NaN 18112.330
armenia 4492.075 3366 1.334544 1068.789619 64.1636 2990.493167 11.883541 31.30 94.000000 5838.258
australia 400194.378 22766 17.578599 1061.928152 89.3372 67723.666872 471.270955 35.19 556.185481 42640.275
austria 62313.331 8466 7.360422 1612.069881 67.8808 47081.738720 733.052303 29.15 529.338286 42408.575
azerbaijan 49075.461 9235 5.314073 1074.721324 53.8886 7450.351922 32.658365 33.71 83.849698 10478.231

Panda’s Dataframe provides a convenient function ‘corr’ that provides the correlation between each and every column of the dataframe.

data.corr()
value pop co2_1000 cellphone urbanpop gdp fuelwood gini vehicles gdp_ppp
value 1.000000 0.795840 0.156284 0.017960 0.100557 0.101115 -0.096348 -0.032174 0.070360 0.116638
pop 0.795840 1.000000 -0.013529 -0.078453 -0.039648 -0.046795 -0.061197 -0.043269 -0.093295 -0.051414
co2_1000 0.156284 -0.013529 1.000000 0.459959 0.483094 0.699502 -0.098430 -0.228570 0.639473 0.764942
cellphone 0.017960 -0.078453 0.459959 1.000000 0.537769 0.401796 -0.071408 -0.092684 0.432711 0.505035
urbanpop 0.100557 -0.039648 0.483094 0.537769 1.000000 0.607351 -0.123386 -0.110808 0.625057 0.670027
gdp 0.101115 -0.046795 0.699502 0.401796 0.607351 1.000000 -0.043769 -0.314087 0.761361 0.945029
fuelwood -0.096348 -0.061197 -0.098430 -0.071408 -0.123386 -0.043769 1.000000 0.120487 -0.169399 -0.054676
gini -0.032174 -0.043269 -0.228570 -0.092684 -0.110808 -0.314087 0.120487 1.000000 -0.423783 -0.275202
vehicles 0.070360 -0.093295 0.639473 0.432711 0.625057 0.761361 -0.169399 -0.423783 1.000000 0.776197
gdp_ppp 0.116638 -0.051414 0.764942 0.505035 0.670027 0.945029 -0.054676 -0.275202 0.776197 1.000000

As we can see from the correlation matrix, plain CO2 emission levels are heavily correlated with population and almost unrelated to the rest. When we discount the population factor by normalizing to get CO2 emission per 1000 people, we find that it somewhat correlated with cellphone connections per 1000 person and urban population percentage. It is well correlated with gdp, vehicles per 1000and gpd_ppp. Now lets try plotting it.

import matplotlib.pyplot as plt
fig = plt.figure(tight_layout=True)
fields = ['cellphone', 'urbanpop', 'gdp', 'fuelwood', 'gini', 'vehicles', 'gdp_ppp']
fig.set_size_inches(12,12) 
for index, field in enumerate(fields):
    ax = fig.add_subplot(3, 3, index+1, xlabel=field, ylabel='CO<sub>2</sub> emissions per 1000 people')
    ax.scatter(data[field], data['co2_1000'])


fig.show()

png

Now lets us remove the records which have NaN as value for CO2 emmisions

clean_data = data[np.isfinite(data['co2_1000'])]
y = clean_data['co2_1000'].values
x = clean_data[['cellphone', 'urbanpop', 'gdp', 'fuelwood', 'gini', 'vehicles', 'gdp_ppp']].values.transpose()
x.shape




(7, 178)

Now lets see which countries have the highest carbon emission per 1000 people.

plt.cool()
sorted_data = clean_data.sort(columns=['co2_1000'], ascending=False)
top50_data = sorted_data[:40]
labels = top50_data.index
co2_1000 = top50_data['co2_1000'].values
figure(figsize=(10,10))
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
colors = ['seashell','pink','plum', 'PapayaWhip','violet', 'PeachPuff','MediumOrchid']
plt.pie(co2_1000,  labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
plt.show()

png

It is interesting to see that the countries that contribute the highest carbon footprint is from middle east followed by the developed countries. So which contribute the least? As expected, countries having the smallest carbon footprint are mostly least developed.

', '.join([x.title() for x in sorted_data[-30:].index])




'Kenya, Laos, Ivory Coast, Togo, Sierra Leone, Haiti, Afghanistan, Guinea-Bissau, Comoros, Myanmar, Timor-Leste, Tanzania, Zambia, Liberia, Mozambique, Nepal, Guinea, Uganda, Burkina Faso, Ethiopia, Eritrea, Madagascar, Niger, Rwanda, Malawi, Central African Republic, Chad, Mali, Congo, Burundi'

Lets make a map of countries and their carbon footprint. We need to map the countries to their 3-letter abbreviations.

Let as look at the distribution of emission amoung the countries. As it is apparent, there is assymmetry in how the carbon emission is distributed.

plt.hist(sorted_data['co2_1000'], color=['green'], bins=15)




(array([91, 23, 29, 18,  4,  4,  2,  1,  2,  1,  0,  1,  0,  0,  2]),
 array([  2.17303704e-02,   2.57036546e+00,   5.11900054e+00,
         7.66763563e+00,   1.02162707e+01,   1.27649058e+01,
         1.53135409e+01,   1.78621760e+01,   2.04108111e+01,
         2.29594461e+01,   2.55080812e+01,   2.80567163e+01,
         3.06053514e+01,   3.31539865e+01,   3.57026216e+01,
         3.82512567e+01]),
 <a list of 15 Patch objects>)

png

This begs the question, who are the biggest absolute contributors?

carbon_absolute = clean_data.sort(columns=['value'], ascending=False)
total_emission = sum(carbon_absolute['value'])
print 'Total Global Emission is:', total_emission
carbon_absolute['percentage'] = 100 * carbon_absolute['value']/total_emission
print carbon_absolute['percentage'][:10]

Total Global Emission is: 29948374.332
country
china           25.667883
usa             17.695662
india            6.609456
russia           5.257000
japan            3.676774
germany          2.452885
iran             2.010311
canada           1.716078
south korea      1.700846
south africa     1.666255
Name: percentage, dtype: float64



plt.hist(carbon_absolute['percentage'], color=['green'], bins=15)




(array([170,   3,   1,   2,   0,   0,   0,   0,   0,   0,   1,   0,   0,
         0,   1]),
 array([  1.71421659e-04,   1.71135221e+00,   3.42253300e+00,
         5.13371379e+00,   6.84489458e+00,   8.55607537e+00,
         1.02672562e+01,   1.19784370e+01,   1.36896177e+01,
         1.54007985e+01,   1.71119793e+01,   1.88231601e+01,
         2.05343409e+01,   2.22455217e+01,   2.39567025e+01,
         2.56678833e+01]),
 <a list of 15 Patch objects>)

png

It is interesting to see that even though the Middle East have the largest per capita contribution, they are insignificant compared to the giants, China, USA and India because of the sheer population size of the latter. China, USA and India together contribute nearly 50% of the world’s carbon emissions. Lets generate a map of the countries of the world by net emissions and by per 1000 people footprint. The code used for generating the required javascript are below the map.

`

Total Emission Map

Emission per 1000 People Map

And here is the code for generating the maps.
def generate_total_emission_map():


    def get_fill(carb_em, index):
        if carb_em >= 10:
            return 'VERYHIGH'
        elif carb_em >= 2:
            return 'HIGH'
        elif carb_em >= 1:
            return 'MEDIUM'
        else:
            return 'LOW'

    fills = {'VERYHIGH':'#FF003C',
             'HIGH':'#FF8A00',
             'MEDIUM':'#FABE28',
             'LOW':'#88C100',
             'VERYLOW':'#00C176',
             'defaultFill':'#87796F'}
    matched_abbr = dict([x for x in csv.reader(open('abbr.csv', 'r'))])
    map_data = defaultdict(lambda: defaultdict(str))
    columns = carbon_absolute.columns

    for index, item in enumerate(carbon_absolute.index):
        abbr = matched_abbr[item]
        map_data[abbr]['rank'] = index + 1
        map_data[abbr]['fillKey'] = get_fill(carbon_absolute.ix[item]['percentage'],item)
        map_data[abbr]['cname'] = item
        map_data[abbr]['population'] = carbon_absolute.ix[item]['pop']/1000
        for col in columns:
            map_data[abbr][col] = "%2.2f" %carbon_absolute.ix[item][col]

    return map_data, fills

    #ratio = /max_carbon
    #fills[abbr] = ratio_to_color(ratio)



import json
map_data, fills = generate_total_emission_map()
jscode = open('javascript_template.js').read().replace('#FILLS#', json.dumps(fills))\
        .replace('#DATA#', json.dumps(map_data))\
        .replace('#MAPNAME#', 'total_emission')

open('total_emission.js', 'w').write(jscode)





def generate_co2_1000_map():


    def get_fill(carb_em, index):
        if carb_em >= 20:
            print 'veryhigh', index
            return 'VERYHIGH'
        elif carb_em >= 10:
            return 'HIGH'
        elif carb_em >= 5:
            return 'MEDIUM'
        elif carb_em >= 2:
            return 'LOW'
        else:
            return 'VERYLOW'

    fills = {'VERYHIGH':'#FF003C',
             'HIGH':'#FF8A00',
             'MEDIUM':'#FABE28',
             'LOW':'#88C100',
             'VERYLOW':'#00C176',
             'defaultFill':'#87796F'}
    matched_abbr = dict([x for x in csv.reader(open('abbr.csv', 'r'))])
    map_data = defaultdict(lambda: defaultdict(str))
    columns = carbon_absolute.columns

    for index, item in enumerate(carbon_absolute.index):
        abbr = matched_abbr[item]
        map_data[abbr]['rank'] = index + 1
        map_data[abbr]['fillKey'] = get_fill(carbon_absolute.ix[item]['co2_1000'],item)
        map_data[abbr]['cname'] = item
        map_data[abbr]['population'] = carbon_absolute.ix[item]['pop']/1000
        for col in columns:
            map_data[abbr][col] = "%2.2f" %carbon_absolute.ix[item][col]

    return map_data, fills


map_data, fills = generate_co2_1000_map()
jscode = open('javascript_template.js').read().replace('#FILLS#', json.dumps(fills))\
        .replace('#DATA#', json.dumps(map_data))\
        .replace('#MAPNAME#', 'co2_1000')

open('co2_1000.js', 'w').write(jscode)
comments powered by Disqus