"""Least-squares fit of the per-capita ``prstat`` rate against work hours and GDP.

For every country in ``cc2name`` the script builds one row of the design
matrix ``[wh * log(gdp), wh, log(gdp), 1]`` and one observation
``prstat / (population / 10,000,000)``, solves the normal equations for the
four coefficients, prints two plain correlations plus the fit quality, and
writes a tab-separated table (sorted by work hours) to ``result.txt``.

The input tables (``oecdworkstat``, ``cc2name``, ``GDP``, ``prstat``,
``population``) come from the ``statdata`` star import.
"""
from __future__ import division, print_function

# Import order is kept as in the original file: the star import comes first so
# that the explicit ``math`` / ``numpy`` names below win on any name clash.
from statdata import *
from math import sqrt, log
from numpy import matrix

# The ``prstat`` figures are expressed per this many inhabitants.
PER_CAPITA_SCALE = 10000000


def stdev(values):
    """Return ``sqrt(n * sum(x**2) - sum(x)**2)`` for the sequence *values*.

    NOTE: despite the name this is ``n`` times the population standard
    deviation, not the standard deviation itself.  ``correlation`` relies on
    exactly this scaling, so the behaviour is kept unchanged.
    """
    return sqrt(len(values) * sum(x ** 2 for x in values) - sum(values) ** 2)


def correlation(a, b):
    """Return the Pearson correlation coefficient of sequences *a* and *b*.

    Uses the computational form
    ``(n*sum(ab) - sum(a)*sum(b)) / (stdev(a) * stdev(b))``.
    Raises ``ZeroDivisionError`` when either sequence has zero variance.
    """
    return (len(a) * sum(v * w for v, w in zip(a, b)) - sum(a) * sum(b)) / (
        stdev(a) * stdev(b))


# Each non-blank line of ``oecdworkstat`` is "<name> <value>"; split on the
# last run of whitespace so that names may themselves contain spaces.  Blank
# lines are skipped because they would otherwise make ``dict()`` raise.
workstat = dict(l.rsplit(None, 1) for l in oecdworkstat.splitlines()
                if l.strip())

A, B = [], []   # design-matrix rows / observed per-capita rates
output = []     # one report row per country
for cc, state in cc2name.items():
    wh = int(workstat[state])
    gdp = GDP[cc]
    A.append([wh * log(gdp), wh, log(gdp), 1])
    B.append([prstat[cc] / (population[cc] / PER_CAPITA_SCALE)])
    output.append([state, prstat[cc], wh, gdp, population[cc],
                   prstat[cc] / population[cc] * PER_CAPITA_SCALE])

print("workhour-pr/pop correlation:",
      correlation([x[2] for x in output], [x[0] for x in B]))
print("GDP-pr/pop correlation:",
      correlation([x[3] for x in output], [x[0] for x in B]))

# Keep the plain-list versions around: they stay in the same order as
# ``output`` and are cheaper to iterate than numpy matrices.
Aorig, A = A, matrix(A)
Borig, B = B, matrix(B)

# Ordinary least squares via the normal equations: X = (A^H A)^-1 A^H B.
X = (A.H * A).I * A.H * B
k1, k2, k3, k4 = (float(X[i, 0]) for i in range(4))


def expect(*v):
    """Evaluate the fitted model on one design-matrix row ``v`` (4 values)."""
    return k1 * v[0] + k2 * v[1] + k3 * v[2] + k4 * v[3]


# Attach each prediction to its own country BEFORE sorting.  ``Aorig`` and
# ``output`` are only aligned while both are still in insertion order; sorting
# first would pair every row with another country's prediction.
for datarow, outputrow in zip(Aorig, output):
    outputrow.append(expect(*datarow))

output.sort(key=lambda x: x[2])  # order the report by work hours
with open('result.txt', 'w') as outf:
    for outputrow in output:
        print("%s\t%d\t%d\t%d\t%d\t%.4f\t%.4f" % tuple(outputrow), file=outf)

print(X)
# Correlation between observed and fitted values as a goodness-of-fit figure.
corr = correlation([row[0] for row in Borig],
                   [expect(*v) for v in Aorig])
print('r=%.4f' % corr)