Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 14 additions & 46 deletions data/downloadInputs.sh
Original file line number Diff line number Diff line change
@@ -1,52 +1,20 @@
#!/bin/bash
#
# Downloads the raw dictionary inputs into the "inputs" directory that sits
# next to this script:
#   - the latest Wiktionary pages-articles dump for en, fr, it, de, es and pt
#   - the German-English word list hosted on ftp.tu-chemnitz.de
#
# Usage: downloadInputs.sh   (may be invoked from any working directory)
# Requires: wget

# Strict mode lives here rather than on the shebang so that it is still in
# effect when the script is started as "bash downloadInputs.sh".
set -euo pipefail

# Resolve the script's directory to an absolute path. This replaces the old
# "cd into the script dir, cd back at the end" dance, which left the caller's
# shell in the wrong place on failure and made the relative "${0}"-based
# input path point at a non-existent directory.
this_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
input_dir="${this_dir}/inputs"

# wget does not create missing parent directories for --output-document.
mkdir -p -- "${input_dir}"

echo "Note that unzipping is slow."
echo "Downloading dictionary files."

# Downloading wikimedia dump files.
# Each dump is stored without the "-latest" infix in its file name.
for l in en fr it de es pt; do
  download_url="http://dumps.wikimedia.org/${l}wiktionary/latest/${l}wiktionary-latest-pages-articles.xml.bz2"
  echo "Downloading data for language '${l}' from: ${download_url}"
  wget --no-verbose --show-progress "${download_url}" \
    --output-document="${input_dir}/${l}wiktionary-pages-articles.xml.bz2"
done

# Downloading de-en from chemnitz.de
download_url='http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/de-en.txt.gz'
echo "Downloading from: ${download_url}"
wget --no-verbose --show-progress "${download_url}" \
  --output-document="${input_dir}/de-en_chemnitz.txt.gz"

echo "Done. Now run './WiktionarySplitter.sh' to split apart wiktionary dump files."