eurostat/preprocessing/preprocessor.py at master · mic0331/eurostat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
from collections import defaultdict
from mapping import get_mapping
from pymongo import MongoClient
import ast
import math
import numpy as np

# Directory holding the input file; joined with DATAFILE by the main script.
DATADIR = './data/'
# Name of the input file that the main script parses.
DATAFILE = 'earn_nt_net.tsv'
# Mapping table loaded once at import time. get_desc_for() searches it for an
# entry whose 'feature' matches a column name and then scans that entry's
# 'codes' list for a matching 'code' to obtain its 'label'.
MAPPING = get_mapping()


def _split_row(raw_line):
    """Split one raw line into stripped cells.

    The line is first split on commas; the fourth comma-separated field is
    then split on tabs and replaces everything from index 3 onwards.
    """
    cells = raw_line.split(",")
    cells[3:] = cells[3].split("\t")
    return [cell.strip() for cell in cells]


def parse_file(datafile):
    """Parse the mixed comma/tab separated data file into a list of dicts.

    The first line is the header. Its fourth column is renamed to
    ``'country'``; every following non-blank line becomes one dict mapping
    the (stripped) header names to the (stripped) cell values.

    Args:
        datafile: Path of the file to read.

    Returns:
        A list with one dict per data line, in file order. An empty file
        yields an empty list.

    Raises:
        IndexError: If a non-blank line has fewer than four comma-separated
            fields, or more cells than the header has columns.
    """
    data = []
    # The legacy 'rU' mode was removed in Python 3.11; the default text mode
    # already performs universal-newline translation.
    with open(datafile, 'r') as f:
        first_line = f.readline()
        if not first_line.strip():
            return data
        header = _split_row(first_line)
        header[3] = 'country'  # use a friendly name
        for line in f:
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            cells = _split_row(line)
            data.append({header[i]: cell for i, cell in enumerate(cells)})
    return data

def classify_per_countries(raw_data):
    """Group rows by their 'country' value.

    Returns a list of ``{'country': <value>, 'stats': [<rows>]}`` dicts, one
    per distinct country, in order of first appearance; the rows inside each
    group keep their input order.
    """
    grouped = {}
    for row in raw_data:
        grouped.setdefault(row['country'], []).append(row)

    result = []
    for name, rows in grouped.items():
        result.append({'country': name, 'stats': rows})
    return result

def inject_data_mongo(data, collection, uri='mongodb://localhost:27017'):
    """Replace the contents of a collection in the ``eurostat`` database.

    The target collection is dropped and then repopulated with ``data``.

    Args:
        data: A single document (dict) or an iterable of documents.
        collection: Name of the collection to overwrite.
        uri: MongoDB connection URI. Defaults to the local development
            server. Pass the production URI here (for example read from an
            environment variable) instead of committing credentials to the
            source file.
    """
    client = MongoClient(uri)
    try:
        target = client.eurostat[collection]
        target.drop()
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one()/insert_many() are the replacements.
        if isinstance(data, dict):
            target.insert_one(data)
        else:
            documents = list(data)
            # insert_many() rejects an empty list, so skip it in that case.
            if documents:
                target.insert_many(documents)
    finally:
        # Always release the connection, even if the insert fails.
        client.close()

def is_float(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Non-convertible strings (``ValueError``), unsupported types such as
    ``None`` (``TypeError``) and integers too large for a float
    (``OverflowError``) all yield False instead of propagating.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def _parse_measure(raw):
    """Convert a raw cell to a number, or NaN when it is not numeric.

    Integer-looking strings become ``int`` and other numeric strings become
    ``float``. Anything else (including the textual forms "nan"/"inf", which
    ``float`` accepts) is handled without raising.
    """
    if isinstance(raw, (int, float)):
        return raw
    try:
        return int(raw)
    except (TypeError, ValueError):
        pass
    try:
        return float(raw)
    except (TypeError, ValueError):
        return float('nan')


def group_years(data, y_from, y_to):
    """Fold the per-year columns of each row into one 'measure' list.

    For every row (modified in place), each key consisting only of digits is
    treated as a year column. Years within ``[y_from, y_to]`` are collected
    as ``{"year": int, "data": number-or-NaN}`` entries sorted by year and
    stored under ``'measure'``. All digit-only keys are removed from the
    row, including those outside the requested range.

    Args:
        data: Iterable of row dicts.
        y_from: First year to keep (inclusive).
        y_to: Last year to keep (inclusive).

    Returns:
        The same ``data`` object, for chaining.
    """
    for line in data:
        year_keys = [key for key in line if key.isdigit()]
        measures = [
            {
                "year": int(key),
                # mark empty / non-numeric values with NaN
                "data": _parse_measure(line[key]),
            }
            for key in year_keys
            if y_from <= int(key) <= y_to
        ]
        # Sort once after collecting instead of after every append.
        measures.sort(key=lambda entry: entry["year"])
        for key in year_keys:
            del line[key]
        line['measure'] = measures
    return data

def fillna(data):
    """Replace NaN measure values with the mean of the row's other values.

    For every row that has a non-empty ``'measure'`` list, the mean of its
    non-NaN ``'data'`` values is computed and written into each entry whose
    ``'data'`` is NaN. When every value is NaN the replacement is 0. Rows
    are modified in place.

    Args:
        data: Iterable of row dicts, each optionally holding a 'measure'
            list of ``{"year": ..., "data": ...}`` entries.

    Returns:
        The same ``data`` object, for chaining.
    """
    for line in data:
        # Look the key up by equality; the previous `k is "measure"` identity
        # test only worked when the key string happened to be interned.
        measures = line.get("measure")
        if not measures:
            continue
        values = [float(entry['data']) for entry in measures]
        if np.all(np.isnan(values)):
            mean = 0
        else:
            # Cast to a plain float so no NumPy scalar leaks into the rows.
            mean = float(np.nanmean(values))
        for entry, value in zip(measures, values):
            if math.isnan(value):
                entry['data'] = mean
    return data

def get_desc_for(code, value, mapping=None):
    """Look up the human-readable label for a feature code.

    Args:
        code: Feature name, matched against each mapping entry's 'feature'.
        value: Code to find in the matching entry's 'codes' list.
        mapping: Optional mapping table to search; defaults to the
            module-level ``MAPPING``.

    Returns:
        The 'label' of the matching code, or ``""`` when the feature, the
        code or the label cannot be found (a message is printed in that
        case).
    """
    if mapping is None:
        mapping = MAPPING
    try:
        feature = next(item for item in mapping if item['feature'] == code)
        match = next(item for item in feature['codes'] if item['code'] == value)
        # Return straight from the lookup so a missing 'label' can never
        # leak the intermediate dict out as the description.
        return match['label']
    except (StopIteration, LookupError, TypeError):
        # Best-effort lookup: report the miss and fall back to no description.
        print("Didn't find ", code, value)
        return ""

def merge_data_label(data):
    """Attach a description to every non-measure field of each row.

    Each value whose key is not ``'measure'`` is replaced in place by
    ``{"code": <original value>, "description": <label>}``, where the label
    comes from ``get_desc_for(key, value)``. The 'measure' entry is left
    untouched.

    Args:
        data: Iterable of row dicts.

    Returns:
        The same ``data`` object, for chaining.
    """
    for line in data:
        # Snapshot the items so the row can be rewritten while looping.
        for key, code in list(line.items()):
            # Compare by equality; `is not "measure"` tested object identity
            # and would wrap the measure list for any non-interned key.
            if key != "measure":
                line[key] = {
                    "code": code,
                    "description": get_desc_for(key, code)
                }
    return data

if __name__ == "__main__":
    # Pipeline: parse the data file, reshape and label the rows, fill the
    # gaps, then load both the rows and the mapping table into MongoDB.
    datafile = os.path.join(DATADIR, DATAFILE)
    # Count the lines inside a context manager so the handle is closed.
    with open(datafile) as source:
        num_lines = sum(1 for _ in source)
    print("Number of lines in the file : {0}".format(num_lines))
    # stage 1 :: parse the file
    data = parse_file(datafile)
    # (optional) stage 2 :: classify the raw data by country
#    data = classify_per_countries(data)
    # stage 3 :: put all the yearly data in its own sub-feature
    data = group_years(data, y_from=2000, y_to=2014)
    # stage 4 :: label the feature
    data = merge_data_label(data)
    # stage 5 :: replace NaN by the mean
    data = fillna(data)
    # load the data per country
    inject_data_mongo(data, 'eurn_nt_nets')
    # load the mapping table
    inject_data_mongo(get_mapping(), 'mappings')