#!/bin/bash
# the original dataset can be downloaded from http://www.ke.tu-darmstadt.de/resources/eurlex

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so a broken download is never unpacked or treated as a success.
set -euo pipefail

# Directory (relative to the current working directory) receiving all files.
DIR=original
# Common URL prefix of every file fetched below.
BASE_URL=http://www.ke.tu-darmstadt.de/files/resources/eurlex

# -p: an already existing directory is fine, but real failures
# (e.g. missing permissions) are still reported instead of being hidden.
mkdir -p "$DIR"

#######################################
# Download one file of the dataset into $DIR, replacing any stale copy.
# Globals:   DIR (read), BASE_URL (read)
# Arguments: $1 - file name below BASE_URL; also used as the local name
# Returns:   exit status of wget
#######################################
fetch() {
  local name=$1
  wget -O "$DIR/$name" "$BASE_URL/$name"
}

#######################################
# Download a gzip-compressed file into $DIR and decompress it in place.
# gzip -d removes the .gz once it is unpacked; -f overwrites the output of
# an earlier run instead of prompting, so the script can be re-run.
# Globals:   DIR (read)
# Arguments: $1 - name of the .gz file below BASE_URL
# Returns:   non-zero if the download or the decompression fails
#######################################
fetch_gz() {
  local name=$1
  fetch "$name" || return
  gzip -d -f "$DIR/$name"
}

# download html sources and unpack them into $DIR/htmls
# (-o: overwrite files from an earlier run without prompting)
fetch eurlex_html_EN_NOT.zip
unzip -q -o "$DIR/eurlex_html_EN_NOT.zip" -d "$DIR/htmls"

# download the id to labels mapping; only the eurovoc qrels member is extracted
fetch eurlex_id2class.zip
unzip -o "$DIR/eurlex_id2class.zip" id2class_eurlex_eurovoc.qrels -d "$DIR"

# download the file which tells us which files are removed
fetch_gz eurlex_ID_mappings.csv.gz

# download the token-string ARFF file
fetch_gz eurlex_tokenstring.arff.gz