Setting up PostgreSQL on a local Linux computer

I recently installed Linux on a new machine and had to go through the setup of PostgreSQL again. Here are the steps I took to get the database up and running:

1. Install postgres and pgadmin

sudo apt-get update
sudo apt-get install postgresql
sudo apt-get install pgadmin3

2. Change password of postgres user

PostgreSQL sets up an admin user (called "postgres") for database maintenance. In order to log in as this user and create the database, change their password:

$ sudo passwd postgres
[sudo] password for phil: 
Enter new UNIX password: 
Retype new UNIX password: 
passwd: password updated successfully

3. Set up the database as postgres user

Next, log in as the postgres user:

$ su
Password: 
phil # su postgres
$

As this user, you can start psql and create a new database. Finally, grant all privileges to your regular user (in my case, phil), so that you have full access to the newly created database:

$ psql
 psql (9.3.5)
 Type "help" for help.
 postgres=# create database test;
 CREATE DATABASE
 postgres=# grant all privileges on database test to phil;
 GRANT
 postgres=# \q

You can now log in to the database test with your normal account by typing

$ psql -d test
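
To quickly check that the grant worked, you can create and drop a throwaway table (the table name here is just an example):

test=> CREATE TABLE smoke_test (id integer);
CREATE TABLE
test=> DROP TABLE smoke_test;
DROP TABLE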

3-dimensional heatmaps in R

I recently came across an R package for producing ternary diagrams called ggtern. With it, you can produce neat heatmaps representing three dimensions.

Below is the full code used to produce this plot:

[Figure: ternary heatmap produced by the code below]

 
require(ggplot2)
require(ggtern)
require(MASS) 
require(scales)
require(plyr)

palette <- c( "#FF9933", "#002C54", "#3375B2", "#CCDDEC", "#BFBFBF", "#000000")

# Example data
# sig <- matrix(c(3,0,0,2),2,2)
# data <- data.frame(mvrnorm(n=10000, rep(2, 2), sig))
# data$X1 <- data$X1/max(data$X1)
# data$X2 <- data$X2/max(data$X2)
# data$X1[which(data$X1<0)] <- runif(length(data$X1[which(data$X1<0)]))
# data$X2[which(data$X2<0)] <- runif(length(data$X2[which(data$X2<0)]))

# Print 2d heatmap
heatmap2d <- function(data) {
    p <- ggplot(data, aes(x=X1, y=X2)) + 
        stat_bin2d(bins=50) + 
        scale_fill_gradient2(low=palette[4], mid=palette[3], high=palette[2]) +
        xlab("Percentage x") +
        ylab("Percentage y") +
        scale_y_continuous(labels = percent) +
        scale_x_continuous(labels = percent) +
        theme_bw() + theme(text = element_text(size = 15))
    print(p)
}

# Example data
# data$X3 <- with(data, 1-X1-X2)
# data <- data[data$X3 >= 0,]

# Auxiliary function for heatmap3d
count_bin <- function(data, minT, maxT, minR, maxR, minL, maxL) {
    ret <- data
    ret <- with(ret, ret[minT <= X1 & X1 < maxT,])
    ret <- with(ret, ret[minL <= X2 & X2 < maxL,])
    ret <- with(ret, ret[minR <= X3 & X3 < maxR,])
    if(is.na(nrow(ret))) {
        ret <- 0
    } else {
        ret <- nrow(ret)
    }
    ret
}

# Plot a 3-dimensional histogram in a triangle
# See the data frame 'data' above for an example of the input data format
heatmap3d <- function(data, inc, logscale=FALSE, text=FALSE, plot_corner=TRUE) {
#   When plot_corner is FALSE, corner_cutoff determines where to stop plotting
    corner_cutoff = 1
#   When plot_corner is FALSE, corner_numbers toggles the display of observation counts in the corners
#   This only has an effect when text==FALSE
    corner_numbers = TRUE
        
    count <- 1
    points <- data.frame()
    for (z in seq(0,1,inc)) {
        x <- 1- z
        y <- 0
        while (x>0) {
            points <- rbind(points, c(count, x, y, z))
            x <- round(x - inc, digits=2)
            y <- round(y + inc, digits=2)
            count <- count + 1
        }
        points <- rbind(points, c(count, x, y, z))
        count <- count + 1
    }
    colnames(points) = c("IDPoint","T","L","R")
    
#   base <- ggtern(data=points,aes(L,T,R)) +
#               theme_bw() + theme_hidetitles() + theme_hidearrows() +
#               geom_point(shape=21,size=10,color="blue",fill="white") +
#               geom_text(aes(label=IDPoint),color="blue")
#   print(base)
  
    polygons <- data.frame()
    c <- 1
#   Normal triangles
    for (p in points$IDPoint) {
        if (is.element(p, points$IDPoint[points$T==0])) {
            next
        } else {
            pL <- points$L[points$IDPoint==p]
            pT <- points$T[points$IDPoint==p]
            pR <- points$R[points$IDPoint==p]
            polygons <- rbind(polygons, 
                        c(c,p),
                        c(c,points$IDPoint[abs(points$L-pL) < inc/2 & abs(points$R-pR-inc) < inc/2]),
                        c(c,points$IDPoint[abs(points$L-pL-inc) < inc/2 & abs(points$R-pR) < inc/2]))    
            c <- c + 1
        }
    }
    
# Upside down triangles
    for (p in points$IDPoint) {
        if (!is.element(p, points$IDPoint[points$T==0])) {
            if (!is.element(p, points$IDPoint[points$L==0])) {
                pL <- points$L[points$IDPoint==p]
                pT <- points$T[points$IDPoint==p]
                pR <- points$R[points$IDPoint==p]
                polygons <- rbind(polygons, 
                            c(c,p),
                            c(c,points$IDPoint[abs(points$T-pT) < inc/2 & abs(points$R-pR-inc) < inc/2]),
                            c(c,points$IDPoint[abs(points$L-pL) < inc/2 & abs(points$R-pR-inc) < inc/2])) 
                c <- c + 1
            }
        }
    }
    
#   IMPORTANT FOR CORRECT ORDERING.
    polygons$PointOrder <- 1:nrow(polygons)
    colnames(polygons) = c("IDLabel","IDPoint","PointOrder")
    
    df.tr <- merge(polygons,points)
    
    Labs = ddply(df.tr,"IDLabel",function(x){c(c(mean(x$T),mean(x$L),mean(x$R)))})
    colnames(Labs) = c("Label","T","L","R")
    
#   triangles <- ggtern(data=df.tr,aes(L,T,R)) +
#                   geom_polygon(aes(group=IDLabel),color="black",alpha=0.25) +
#                   geom_text(data=Labs,aes(label=Label),size=4,color="black") +
#                   theme_bw()
#        print(triangles)
    
    bins <- ddply(df.tr, .(IDLabel), summarize, 
                maxT=max(T),
                maxL=max(L),
                maxR=max(R),
                minT=min(T),
                minL=min(L),
                minR=min(R))
    
    count <- ddply(bins, .(IDLabel), summarize, N=count_bin(data, minT, maxT, minR, maxR, minL, maxL))
    df <- join(df.tr, count, by="IDLabel")
    
    Labs = ddply(df,.(IDLabel,N),function(x){c(c(mean(x$T),mean(x$L),mean(x$R)))})
    colnames(Labs) = c("Label","N","T","L","R")
    
    if (plot_corner==FALSE){
        corner <- ddply(df, .(IDPoint, IDLabel), summarize, maxperc=max(T,L,R))
        corner <- corner$IDLabel[corner$maxperc>=corner_cutoff]
        
        df$N[is.element(df$IDLabel, corner)] <- 0
        if (text==FALSE & corner_numbers==TRUE) {
            Labs$N[!is.element(Labs$Label, corner)] <- ""
            text=TRUE
        }
    }    

    heat <- ggtern(data=df,aes(L,T,R)) +
        geom_polygon(aes(fill=N,group=IDLabel),color="black",alpha=1)
    if (logscale == TRUE) {
            heat <- heat + scale_fill_gradient(name="Observations", trans = "log",
                            low=palette[2], high=palette[4])
    } else {
            heat <- heat + scale_fill_gradient(name="Observations", 
                            low=palette[2], high=palette[4])
    }
    heat <- heat +
        Tlab("x") +
        Rlab("y") +
        Llab("z") +
        theme_bw() + 
        theme(axis.tern.arrowsep=unit(0.02,"npc"), #0.01npc away from ticks ticklength
                    axis.tern.arrowstart=0.25,axis.tern.arrowfinish=0.75,
                    axis.tern.text=element_text(size=12),
                    axis.tern.arrow.text.T=element_text(vjust=-1),
                    axis.tern.arrow.text.R=element_text(vjust=2),
                    axis.tern.arrow.text.L=element_text(vjust=-1),
                    axis.tern.arrow.text=element_text(size=12),
                    axis.tern.title=element_text(size=15))
    if (text==FALSE) {
        print(heat)
    } else {
        print(heat + geom_text(data=Labs,aes(label=N),size=3,color="white"))
    }
}

# Usage examples
# heatmap3d(data, 0.2, text=TRUE)
# heatmap3d(data, 0.05)
# heatmap3d(data, 0.1, text=FALSE, logscale=TRUE)
# heatmap3d(data, 0.1, text=TRUE, logscale=FALSE, plot_corner=FALSE)
# heatmap3d(data, 0.1, text=FALSE, logscale=FALSE, plot_corner=FALSE)

Measuring PostgreSQL performance

Since PostgreSQL performance was not quite up to speed on the server I am currently using, I had a look at how to measure and optimize it. PostgreSQL comes with a neat benchmarking tool, pgbench, which helps you tweak the performance settings to your environment.

Settings can be changed in your postgresql.conf file; the easiest way to change them is via the Backend Configuration Editor in PgAdmin. To see a currently used setting, run for example

SELECT name, setting, unit FROM pg_settings WHERE name = 'shared_buffers';
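
For reference, a change in postgresql.conf is just a plain-text edit; a typical line looks like this (the value below is only an example, pick one that suits your machine's memory):

shared_buffers = 512MB    # memory for PostgreSQL's shared buffer cache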

In order to reload the settings after a change without restarting the server, you can use the command below. Note that some settings (such as shared_buffers) only take effect after a full server restart:

SELECT pg_reload_conf();

Here is how I used pgbench on the Windows Enterprise Server to get a sense of PostgreSQL's performance.

1. Create a new benchmarking database in PgAdmin. I called it "pgbench".

2. Create the benchmarking tables using pgbench -i. Here's the command I used on Windows:

./pgbench.exe -i -s 1 -h [servername] -U [username] pgbench

The command-line argument -s sets the scaling factor used. I ran the test for a number of scaling factors between 1 and 1000 to see where there are drops in performance and how they relate to the database size.
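
A minimal sketch of such a sweep, assuming a bash-like shell (e.g. Git Bash) and the same placeholders as above:

for s in 1 10 100 1000; do
    # re-initialize the benchmark tables at the given scaling factor
    ./pgbench.exe -i -s $s -h [servername] -U [username] pgbench
    # run the read-only benchmark; the last line of output is the TPS figure
    ./pgbench.exe -t 2000 -c 8 -S -h [servername] -U [username] pgbench
done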

3. In order to check the database size, use the following command:

SELECT datname, pg_size_pretty(pg_database_size(oid)) FROM pg_database WHERE datname = 'pgbench';

4. Run the benchmark:

./pgbench.exe -t 2000 -c 8 -S -h [servername] -U [username] pgbench

The final line in the output is the number of transactions per second, which is what we use to measure performance.

 

[Figure: pgbench performance results]

Downloading historical timeseries in Python Pandas

I have been meaning to fetch historical price data for ETFs for a project in Python, and realized this is not as easy as I thought. In this and the next couple of posts I will go through my attempts at downloading the data and explain why I had to give up on this method.

First up: Importing the data directly from finance.yahoo.com into pandas.

import pandas as pd
import numpy as np
from pandas.io.data import DataReader
from datetime import datetime
from dateutil.relativedelta import relativedelta


HORIZON = 3 # All figures are based on timeseries dating back 3 years


start_date = datetime.today() - relativedelta(years=HORIZON)

asset_names = ["^FTSE", "GSG", "TIP", "IYY"]
assets = []
asset_returns = pd.DataFrame(columns = asset_names)

for i in range(len(asset_names)):
    data = DataReader(asset_names[i], "yahoo", start=start_date)
    daily_returns = data['Close'].diff() / data['Close']
    asset_returns[asset_names[i]] = daily_returns
    assets.append({'name':  asset_names[i],
                   'mean':  np.mean(daily_returns),
                   'stdev': np.std(daily_returns)
                   })

This works beautifully. I ran into issues because the data quality and availability on Yahoo is not great, and I had to look for other sources. But if you are looking for a quick and easy way of getting some financial timeseries into Python, pandas.io.data.DataReader is almost impossible to beat.

Generating links recursively and removing a substring from the filename

I recently encountered the following problem:

A folder contains a number of subdirectories, each with a large number of *.txt files. The filenames all contain dot-separated information regarding the file's contents. Among other things, the file name contains a string describing a discount curve used for a calculation. It can be any of these strings: LIBOR 6M_HUF 1M_EUR 3M_EUR 3M_HUF 3M_CHF 6M_EUR 1M_USD 3M_RON 6M_CHF OI_EUR OI_CHF 6M_USD 3M_USD 1M_CHF 3M_CZK 6M_CZK 1M_CZK 3M_JPY 6M_JPY 12M_EUR 6M_PLN 3M_PLN 3M_RUB EURIBOR OI_USD MANUAL 12M_USD 3M_GBP 3M_TRY 3M_CAD. I needed to create links for all files such that this string is removed without changing anything else, i.e. the file dir/subdir1/test.file.3M_CAD.txt is linked from dir/subdir1/test.file..txt.

Here's how I did it in bash:

echo "LIBOR 6M_HUF 1M_EUR 3M_EUR 3M_HUF 3M_CHF 6M_EUR 1M_USD 3M_RON 6M_CHF OI_EUR OI_CHF 6M_USD 3M_USD 1M_CHF 3M_CZK 6M_CZK 1M_CZK 3M_JPY 6M_JPY 12M_EUR 6M_PLN 3M_PLN 3M_RUB EURIBOR OI_USD MANUAL 12M_USD 3M_GBP 3M_TRY 3M_CAD" > curves.txt
find . -type d > listdir.txt
dir=`pwd`
for c in `cat curves.txt`; do
    for d in `cat listdir.txt`; do 
        cd $d; 
        find . -maxdepth 1 -name "*$c*.txt" -exec bash -c 'ln -sf $0 ${0/'"$c"'/}' {} \;;
        cd $dir; 
    done;
    echo $c; 
done;
echo "Done."

Loading messy data sets in R

To look at how to import data, clean it and get some useful information out of it, we'll be looking at a messy data set containing information from world wide disasters in the last 100 years. The data can be found here.

After saving it, we want to import it into R. It is tab-separated and includes a header, so this might be a good first try (the file name below is just a placeholder for wherever you saved the data):

> dis <- read.table("disasters.txt", sep="\t", header=TRUE)

R trips up because there are more data columns than names in the header. We can instead import the data without the header, remove the last column, and attach the header information to the data frame in R:

> dis <- read.table("disasters.txt", sep="\t", header=FALSE, skip=1)
> dis <- dis[, -ncol(dis)]
> names(dis) <- scan("disasters.txt", what="character", nlines=1, sep="\t")
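
To confirm that the columns now line up with their names, a quick look at the imported data helps:

> str(dis)
> head(dis)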

Linear regression in R: Interpreting the summary

When performing a linear regression in R, the program outputs a lot of relevant information when you call summary(). In this post we'll go through all the figures and discuss how to interpret them. An Excel sheet containing all the calculations is available here.

To get started, we perform a regression on the Boston data set, which is part of the MASS package:

> library(MASS)
> names(Boston)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age" 
[8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
> ?Boston

We will try to predict the median value of owner-occupied homes (medv) based only on the crime rate (crim):

> boston.lm.fit <- lm(medv ~ crim, data = Boston)
> summary(boston.lm.fit)
Call:
lm(formula = medv ~ crim, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max
-16.957  -5.449  -2.007   2.512  29.800

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.03311    0.40914   58.74

Based on this output, let's go through all the values and see what they tell us. First let's have a look at a plot of this data:

There's definitely a trend here (higher crime rate corresponds to lower house prices), as expected, but only up to a point. After a crime rate of about 40, the median house prices remain roughly constant. There is also a more significant problem with our regression: we predict negative house prices for areas with very high crime rates. We will see how to address these two issues in subsequent posts.

Call
This line shows the call made to calculate the regression. In this case this is not really helpful, but when the call is more complicated and includes higher order and interaction terms it is helpful to have this stored somewhere.

Residuals
The residuals are defined as the difference between the actual and the predicted value. If our prediction showed no bias, these residuals would be distributed evenly around zero (just as many predictions too low as too high). The R output gives a good first impression of this distribution. A good rule of thumb is that the median should be close to zero and the absolute values of the 1Q and 3Q values should be approximately equal. It seems that in this case the residuals are not distributed evenly around zero, indicating problems with the fit.
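
The numbers printed under Residuals are essentially the quantiles of the residuals, which you can check directly from the fit object defined above:

> quantile(residuals(boston.lm.fit))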

Coefficients
These are the actual coefficients for the intercept and the predictor variables, along with their standard errors, fitted using least squares. If the absolute value of the estimate is not much larger than the standard error, there's a good chance that the real coefficient is actually zero. The t value measures exactly that; it is defined as

 t value = Estimate / Std.Error.

As a rule of thumb, the absolute value of the t value should be larger than 2, and the bigger the value, the better. The p value gives the probability of observing a t value at least this extreme if the true coefficient were zero. If this probability is very small, it is very unlikely that there is no relationship between the predictor and the dependent variable.
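
To get at these numbers programmatically, the coefficient table can be pulled out of the summary object; a short sketch using the fit from above:

> cf <- coef(summary(boston.lm.fit))     # columns: Estimate, Std. Error, t value, Pr(>|t|)
> cf[, "Estimate"] / cf[, "Std. Error"]  # reproduces the t value column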

Signif. codes
The symbols show the significance level. As a general rule of thumb, the p value should be at most 0.05. R indicates significance by printing one (*) to three (***) stars next to the predictor variable.

Residual standard error and degrees of freedom
The degrees of freedom are the number of observations minus the number of fitted parameters. In our case there are 506 observations and 2 parameters (one for the intercept and one for the crim variable), leaving 504 degrees of freedom. The more degrees of freedom, the less likely we are to overfit.
The residual standard error is defined as

 RSE = sqrt(RSS/df),

where the RSS is the Residual Sum of Squares (the sum of the squares of the residuals). The residual standard error gives an indication of "how wrong" the prediction is, on average.

(Adjusted) R-squared
The R-squared gives the percentage of the total variation which is explained by the model, i.e. R^2 = 1 - RSS/TSS. RSS is the Residual Sum of Squares, as above. TSS is the Total Sum of Squares, that is the sum of the squared differences between the variable and its mean. TSS measures the variability given in the data.
Adjusted R^2 also incorporates the number of parameters:

 Adj.R^2 = 1 - (RSS/df) / (TSS/(N-1)),

where N is the number of observations (in our case 506). If there is no danger of overfitting, the Adjusted R^2 should be very close to the R^2.

F-statistic
The F-statistic tests the hypothesis that all coefficients except the intercept are zero. This is more useful in the case of multiple regression, where it gives a general indication of whether the complete model is any good. The F-statistic is calculated with the following formula:

 F = ((TSS - RSS)/k) / (RSS/(N - k - 1)),

where k is the number of predictors (in our case k = 1).
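
All of these quantities are easy to reproduce by hand from the fit object; a minimal sketch in R, using boston.lm.fit and the Boston data from above:

RSS <- sum(residuals(boston.lm.fit)^2)            # residual sum of squares
TSS <- sum((Boston$medv - mean(Boston$medv))^2)   # total sum of squares
N <- nrow(Boston)                                 # 506 observations
k <- 1                                            # one predictor (crim)
sqrt(RSS / (N - k - 1))                           # residual standard error
1 - RSS / TSS                                     # R-squared
1 - (RSS / (N - k - 1)) / (TSS / (N - 1))         # adjusted R-squared
((TSS - RSS) / k) / (RSS / (N - k - 1))           # F-statistic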

Pig & mongo-hadoop on a local Ubuntu cluster

I had a surprisingly hard time getting Pig and mongo-hadoop to work on my local Ubuntu machine. In this post I'll go through the steps of installing pig-0.12.0 and MongoDB 2.2.3 locally. The code I used to make sure everything is running correctly can be found at https://github.com/phildeutsch.

I will install everything in $HOME/hadoop.

Installing pig

wget http://mirror2.klaus-uwe.me/apache/pig/stable/pig-0.12.0.tar.gz
tar xzf pig-0.12.0.tar.gz
cd pig-0.12.0
ant 
cd contrib/piggybank/java
ant
cd ~/hadoop/pig-0.12.0
ant clean jar-all -Dhadoopversion=23

You also need to add the path to pig-0.12.0 to your PATH variable and source your .bashrc. Add this line to ~/.bashrc:

export PATH=$PATH:[path to pig-0.12.0]

and type

source ~/.bashrc
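
To check that the shell now picks up Pig, you can (for example) print the version:

pig -version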

Installing MongoDB

For this I followed the guide at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-ubuntu/:

sudo apt-get install mongodb-10gen=2.2.3
echo "mongodb-10gen hold" | sudo dpkg --set-selections

Make sure that everything is running OK by typing "mongo", which should get you to the MongoDB shell.
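
Once inside the shell, any simple command confirms that the server is responding, for example:

> db.version()
> show dbs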

Downloading the MongoDB Java driver

cd hadoop
wget https://github.com/downloads/mongodb/mongo-java-driver/mongo-2.10.1.jar

Installing mongo-hadoop

cd hadoop
git clone https://github.com/mongodb/mongo-hadoop
cd mongo-hadoop
./sbt package

To push data from Pig to MongoDB, you need to register the three .jars by adding the following three lines at the beginning of your .pig script:

REGISTER [.../]hadoop/mongo-2.10.1.jar 
REGISTER [.../]hadoop/mongo-hadoop/core/target/mongo-hadoop-core_2.2.0-1.2.0.jar
REGISTER [.../]hadoop/mongo-hadoop/pig/target/mongo-hadoop-pig_2.2.0-1.2.0.jar
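
With the jars registered, pushing a relation to MongoDB is then a single STORE statement. A minimal sketch (the input path, the database/collection name and the storage class arguments are assumptions; check the mongo-hadoop pig README for your version):

-- load some tab-separated data and write it to a local MongoDB collection
records = LOAD 'input.tsv' AS (id:chararray, value:double);
STORE records INTO 'mongodb://localhost:27017/test.example'
    USING com.mongodb.hadoop.pig.MongoInsertStorage('', '');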

Email data analysis

In order to learn some more about data analytics, I am working through Russell Jurney's book "Agile Data Science". In it, he uses data downloaded from Gmail to illustrate the principles, and helpfully he has set up a GitHub repository with all the code used throughout the book (https://github.com/rjurney/Agile_Data_Code). I will be analysing data obtained from Microsoft Outlook. Since getting the data prepared was a bit of a hassle, I will document the steps here.

1) Export all emails into a .pst file.

2) Get readpst and transform the data into mbox format. On Ubuntu, this should be as easy as typing

sudo apt-get install pst-utils
readpst -r emails.pst

This creates an mbox file for each folder in the pst file containing all emails in the folder.

3. Reading the mbox file in Python is pretty easy once you know about the mailbox module:

import mailbox

mboxfile='mbox'
m = []
for message in mailbox.mbox(mboxfile):
    m.append(message)
m[0].keys()
m[0]['Subject']

The next step will be getting all those mails from the mbox files into an avro schema.

Tutorial: Buying XCP for BTC

This post is somewhat off-topic, but in the last couple of days I've been playing around with counterpartyd, the reference implementation of the Counterparty protocol. It took me a while to figure everything out, so I thought I'd post a short tutorial to make it easier for anyone out there trying to do the same.

Counterparty is a protocol for a distributed, open source financial marketplace. With it, you can issue and trade assets, make broadcasts and make bets. Everything is decentralized, open source and built on top of the bitcoin blockchain. Sounds pretty good so far!

The only downside is that in order to use the exchange, you need an altcoin called XCP. It is used for issuing assets and making bets, as well as for payouts. Although it can be exchanged for BTC directly through the counterpartyd client, that process is not exactly straightforward.

At the moment, the implementation is very much at an alpha stage. There is no GUI, so everything must be done via the terminal, and sometimes it throws not-so-helpful errors. But it works: today I bought some XCP, and I will explain every step in this post.

First, install bitcoind and counterpartyd. This part is very well explained in the docs, and I had no problems worth mentioning getting everything ready. Once everything is installed, start the bitcoind server and the counterpartyd server:

$ bitcoind
$ Bitcoin server starting
$ counterpartyd
Status: RESTART
Block: 288486
Block: 288487
Block: 288488

You might have to wait until the whole blockchain is indexed by counterpartyd. As soon as the number of blocks is the same as the one given by "bitcoind getinfo | grep blocks", you are good to go. Open a second terminal window and leave the server running in the background. It'll give you useful information regarding order matches and so on later.

Next, send the bitcoins you want to spend to a new address. You can check whether they have arrived by typing "bitcoind listreceivedbyaddress". To make sure the counterpartyd client is in sync, type "counterpartyd balances <address>".

Figure out how many XCP you would like to receive for your BTC. You can check out the last price on blockscan. When I made the trade, the price was 0.01 BTC/XCP, and since I was going to spend 0.5 BTC, I'd expect to get 50 XCP. On that site you can also see the latest transactions, the current order book and so on. To place the order, type

counterpartyd order --source <address> --get-quantity 50 --get-asset XCP --give-quantity 0.5 --give-asset BTC --expiration 100 --fee-fraction-provided 0.001

The expiration is given in blocks (i.e. 100 blocks translates to roughly 16 hours). When giving BTC, you need to provide a fee. I tried 0.0001, which resulted in an error, so I increased it to 0.001 and everything worked nicely.

After entering the order, you need to wait a little until it gets included in the next block (approximately 10 minutes). After that, it should show up both in your server window and on blockscan. There you can also click on "View Order Matches" to see whether it (or parts of it) have been matched. If there are matches, you will get the XCP as soon as you have paid the required BTC amount.

To do this, look in your counterpartyd server terminal window for a line starting with "Order Match: " and compare it with the information on blockscan. It shows the amounts (both in BTC and XCP) and the transaction hash. Copy the hash to the clipboard.

In the other window, enter the command

btcpay --order-match-id <transaction hash>

Finally, to make sure the XCP have been credited to you, enter "counterpartyd wallet" to see the balances and the address.