From d45c8840e9062ec9a92ced3b422ade499260dbc4 Mon Sep 17 00:00:00 2001
From: Quentin Nerden
Date: Wed, 5 Jun 2019 15:41:39 +0200
Subject: [PATCH] DRY the code of the downloadInputs.sh script

- avoid repetitions in the code (DRY principle): a single download
  helper now serves both the wiktionary dumps and the chemnitz file.
- use wget instead of curl to give more feedback to the user.
- avoid a few bash warnings reported by shellcheck.
- avoid changing directory within the script, this is error prone.
- download into a '.part' file and rename it on success, so that a failed
  transfer never leaves a truncated archive under its final name.
- create the inputs directory when it is missing.
---
 data/downloadInputs.sh | 75 ++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 43 deletions(-)

diff --git a/data/downloadInputs.sh b/data/downloadInputs.sh
index 0929f63..113db3b 100755
--- a/data/downloadInputs.sh
+++ b/data/downloadInputs.sh
@@ -1,52 +1,41 @@
 #!/bin/bash -e
 
-OLD_DIR=`pwd`
-DIR=`dirname $0`
-cd $DIR
+# Keep failing fast even when invoked as 'bash downloadInputs.sh', which
+# ignores the options given on the shebang line.
+set -e
 
-echo "Note that unzipping is slow."
+this_dir=$(dirname "${0}")
+input_dir="${this_dir}"/inputs
 
-L=en
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
+# Download the URL ${1} to ${input_dir}/${2}.
+# wget truncates its --output-document target before the transfer starts,
+# so fetch into a '.part' file and rename it only on success; a failed
+# transfer then never leaves a truncated archive under the final name.
+download() {
+    local url="${1}"
+    local target="${input_dir}/${2}"
+    if ! wget --no-verbose --show-progress "${url}" --output-document="${target}.part"; then
+        rm -f -- "${target}.part"
+        echo "Download failed: ${url}" >&2
+        return 1
+    fi
+    mv -- "${target}.part" "${target}"
+}
 
-echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/"
-CHEMNITZ=de-en.txt
-curl -L --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz
-mv ${CHEMNITZ}.gz inputs/de-en_chemnitz.txt.gz
+mkdir -p "${input_dir}"
 
-L=fr
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
+echo "Downloading dictionary files."
 
-L=it
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
+# Downloading wikimedia dump files
+for l in en fr it de es pt; do
+    download_url="http://dumps.wikimedia.org/${l}wiktionary/latest/${l}wiktionary-latest-pages-articles.xml.bz2"
+    echo "Downloading data for language '${l}' from: ${download_url}"
+    download "${download_url}" "${l}wiktionary-pages-articles.xml.bz2"
+done
 
-L=de
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
+# Downloading de-en from chemnitz.de
+download_url='http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/de-en.txt.gz'
+echo "Downloading from: ${download_url}"
+download "${download_url}" "de-en_chemnitz.txt.gz"
 
-L=es
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
-
-L=pt
-echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-latest-pages-articles.xml
-curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
-
-echo "Done. Now run WiktionarySplitter to split apart enwiktionary."
-
-cd $OLD_DIR
+echo "Done. Now run './WiktionarySplitter.sh' to split apart wiktionary dump files."