From d7b5148b989cfd4ca38a1e21aa6cac651486e048 Mon Sep 17 00:00:00 2001
From: Kjell Wooding <kjell@wooding.org>
Date: Wed, 11 Jul 2018 20:32:13 -0400
Subject: [PATCH 1/3] more helper functions for adding datasets

---
 .../data/datasets.py                          | 13 +++++++
 .../data/utils.py                             | 34 +++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py
index 7ad8367..53313cc 100644
--- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py	
+++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py	
@@ -17,6 +17,7 @@
 jlmem = Memory(cachedir=str(interim_data_path))
 
 def new_dataset(*, dataset_name):
+    global dataset_raw_files
 
     dset = Bunch()
     dset['metadata'] = {}
@@ -33,6 +34,18 @@ def new_dataset(*, dataset_name):
 
     return dset
 
+def add_dataset_by_urllist(dataset_name, url_list):
+    """Add a new dataset by specifying a url_list
+
+    url_list is a list of dicts keyed by:
+        * url, hash_type, hash_value, name, file_name
+    Returns the `dataset_name` entry of the mapping given by `read_datasets()`.
+    """
+    global dataset_raw_files  # rebound below, hence the global declaration
+    dataset_raw_files[dataset_name] = {'url_list': url_list}
+    write_dataset()  # presumably persists dataset_raw_files; confirm in write_dataset
+    dataset_raw_files = read_datasets()
+    return dataset_raw_files[dataset_name]
 
 @jlmem.cache
 def load_dataset(dataset_name, return_X_y=False, **kwargs):
diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py
index 78c13c5..50cb6bd 100644
--- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py	
+++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py	
@@ -77,6 +77,40 @@ def fetch_files(force=False, dst_dir=None, **kwargs):
         result_list.append(fetch_file(force=force, dst_dir=dst_dir, **url_dict))
     return all([r[0] for r in result_list]), result_list
 
+def fetch_text_file(url, file_name=None, dst_dir=None, force=True, **kwargs):
+    """Fetch a text file (via URL) and return it as a string.
+
+    Arguments
+    ---------
+    url:
+        URL of the text file; passed straight through to `fetch_file`
+    file_name:
+        output file name. If not specified, use the last
+        component of the URL
+    dst_dir:
+        directory to place downloaded files
+    force: boolean
+        If True (the default for this helper), download is always
+        attempted. If False, the URL is only downloaded if `file_name`
+        is not present on the filesystem, or if the existing file has
+        a bad hash.
+
+    In addition to these options, any of `fetch_file`'s keywords may
+    also be passed
+
+    Returns
+    -------
+    fetched string, or None if something went wrong with the download
+    """
+    retlist = fetch_file(url, file_name=file_name, dst_dir=dst_dir,
+                         force=force, **kwargs)
+    if not retlist[0]:
+        logger.warning(f'fetch of {url} failed with status: {retlist[0]}')
+        return None
+    with open(retlist[1], 'r') as txt:  # retlist is (status, filename, hash)
+        return txt.read()
+
+
 def fetch_file(url,
                file_name=None, dst_dir=None,
                force=False,

From b7d4009504fe7d55bc2cd7a59a3ca2a9f0032d0f Mon Sep 17 00:00:00 2001
From: Kjell Wooding <kjell@wooding.org>
Date: Thu, 12 Jul 2018 05:24:16 -0400
Subject: [PATCH 2/3] unpack plain tarfiles too

---
 .../{{ cookiecutter.module_name }}/data/utils.py               | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py
index 50cb6bd..7fae0cc 100644
--- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py	
+++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py	
@@ -220,6 +220,9 @@ def unpack(filename, dst_dir=None, create_dst=True):
     elif path.endswith('.tar.bz2') or path.endswith('.tbz'):
         archive = True
         opener, mode = tarfile.open, 'r:bz2'
+    elif path.endswith('.tar'):
+        archive = True
+        opener, mode = tarfile.open, 'r'
     elif path.endswith('.gz'):
         opener, mode = gzip.open, 'rb'
         outfile, outmode = path[:-3], 'wb'

From 84669eff72c5d63236d2d2e859a5506aa5eac4a1 Mon Sep 17 00:00:00 2001
From: Kjell Wooding <kjell@wooding.org>
Date: Thu, 12 Jul 2018 05:26:14 -0400
Subject: [PATCH 3/3] add an example dataset (and pull in pandas)

---
 {{ cookiecutter.repo_name }}/environment.yml  |   1 +
 .../notebooks/00-add-example-dataset.ipynb    | 706 ++++++++++++++++++
 2 files changed, 707 insertions(+)
 create mode 100644 {{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb

diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml
index e06b4c2..540e40c 100644
--- a/{{ cookiecutter.repo_name }}/environment.yml	
+++ b/{{ cookiecutter.repo_name }}/environment.yml	
@@ -24,6 +24,7 @@ dependencies:
   - scikit-learn
   - joblib
   - nb_conda
+  - pandas
 {% if cookiecutter.python_interpreter == 'python3' -%}
 {{ py3dep()|indent(2, true) }}
 {% else -%}
diff --git a/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb b/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb
new file mode 100644
index 0000000..688374b
--- /dev/null
+++ b/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb	
@@ -0,0 +1,706 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from phoneme.data import datasets\n",
+    "from phoneme.data.utils import build_dataset_dict, hash_file, fetch_and_unpack, fetch_text_file\n",
+    "from phoneme.paths import raw_data_path\n",
+    "from functools import partial\n",
+    "from phoneme.paths import interim_data_path, raw_data_path\n",
+    "\n",
+    "from pprint import pprint"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Adding the LVQ-PAK Finnish Phonetic dataset\n",
+    "\n",
+    "The Learning Vector Quantization project includes a simple Finnish phonetic dataset\n",
+    "consisting (presumably) of 20-dimensional feature vectors, each with a class label."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_name='lvq-pak'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "************************************************************************\n",
+      "*                                                                      *\n",
+      "*                              LVQ_PAK                                 *\n",
+      "*                                                                      *\n",
+      "*                                The                                   *\n",
+      "*                                                                      *\n",
+      "*                   Learning  Vector  Quantization                     *\n",
+      "*                                                                      *\n",
+      "*                          Program  Package                            *\n",
+      "*                                                                      *\n",
+      "*                   Version 3.1 (April 7, 1995)                        *\n",
+      "*                                                                      *\n",
+      "*                          Prepared by the                             *\n",
+      "*                    LVQ Programming Team of the                       *\n",
+      "*                 Helsinki University of Technology                    *\n",
+      "*           Laboratory of Computer and Information Science             *\n",
+      "*                Rakentajanaukio 2 C, SF-02150 Espoo                   *\n",
+      "*                              FINLAND                                 *\n",
+      "*                                                                      *\n",
+      "*                      Copyright (c) 1991-1995                         *\n",
+      "*                                                                      *\n",
+      "************************************************************************\n",
+      "*                                                                      *\n",
+      "*  NOTE: This program package is copyrighted in the sense that it      *\n",
+      "*  may be used for scientific purposes. The package as a whole, or     *\n",
+      "*  parts thereof, cannot be included or used in any commercial         *\n",
+      "*  application without written permission granted by its producents.   *\n",
+      "*  No programs contained in this package may be copied for commercial  *\n",
+      "*  distribution.                                                       *\n",
+      "*                                                                      *\n",
+      "*  All comments concerning this program package may be sent to the     *\n",
+      "*  e-mail address 'lvq@nucleus.hut.fi'.                                *\n",
+      "*                                                                      *\n",
+      "************************************************************************\n",
+      "\n",
+      "This package contains all the programs necessary for the correct\n",
+      "application of certain LVQ (Learning Vector Quantization) algorithms\n",
+      "in an arbitrary statistical classification or pattern recognition\n",
+      "task.  To this package four options for the algorithms, the\n",
+      "LVQ1, the LVQ2.1, the LVQ3 and the OLVQ1, have been selected.  \n",
+      "\n",
+      "In the implementation of the LVQ programs we have tried to use as\n",
+      "simple a code as possible.  Therefore the programs are supposed to\n",
+      "compile in various machines without any specific modifications made on\n",
+      "the code.  All programs have been written in ANSI C.\n",
+      "\n",
+      "The lvq_pak program package includes the following files:\n",
+      "  - Documentation:\n",
+      "      README             this file\n",
+      "      lvq_doc.ps         documentation in (c) PostScript format\n",
+      "      lvq_doc.ps.Z       same as above but compressed\n",
+      "      lvq_doc.txt        documentation in ASCII format\n",
+      "  - Source file archives:\n",
+      "      lvq_p3r1.exe       Self-extracting MS-DOS archive file\n",
+      "      lvq_pak-3.1.tar    UNIX tape archive file\n",
+      "      lvq_pak-3.1.tar.Z  same as above but compressed\n",
+      "\n",
+      "Installation in UNIX (in more detail, see lvq_doc.ps/txt):\n",
+      "  - Uncompress lvq_pak-3.1.tar.Z\n",
+      "  - Extract the files with \"tar xovf lvq_pak-3.1.tar\" which creates\n",
+      "    the subdirectory lvq_pak-3.1\n",
+      "  - Copy makefile.unix to the name makefile\n",
+      "  - Revise switches in the makefile, if necessary\n",
+      "  - Execute \"make\"\n",
+      "\n",
+      "Installation in MS-DOS (in more detail, see lvq_doc.ps/txt):\n",
+      "  - By executing the command lvq_p3r1 the self-extracting archive\n",
+      "    creates the directory lvq_pak.3r1 and extracts all the files in it\n",
+      "  - You are supposed to use Borland C++ Version 3.1 and to have\n",
+      "    all the necessary environment settings\n",
+      "  - Copy the file makefile.dos to the name makefile\n",
+      "  - Revise the compiler switches in the makefile, if necessary\n",
+      "  - Execute \"make\"\n",
+      "\n",
+      "Revision history:\n",
+      "  - Version 1.0 was released 19 December 1991.\n",
+      "  - Version 1.1 containing only a minor bug fix in memory allocation\n",
+      "    was released 31 December 1991.\n",
+      "  - Version 2.0 containing major modifications in the algorithms was\n",
+      "    released January 31, 1992.\n",
+      "  - Version 2.1 containing some improvements in the speed of algorithms\n",
+      "    and one new program was released October 9, 1992.\n",
+      "  - Version 3.0 containing many advanced features conserning application\n",
+      "    of the algorithms in large problems was released March 1, 1995; for\n",
+      "    these changes see documentation.\n",
+      "  - Version 3.1 containing only a bug fix in random ordering\n",
+      "    was released 7 April 1995.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(fetch_text_file('http://www.cis.hut.fi/research/lvq_pak/README', file_name=f'{dataset_name}.readme'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "No hash is supplied for the LVQ-PAK files here; note the `sha1` hash values that appear in the generated URL list below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Grab the source code package\n",
+    "lvq_pak = build_dataset_dict(url=\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you download files with names DESCR or LICENSE, they will be used as the description and license text respectively.\n",
+    "Usually you will want to give these unique names, so they don't clash with other downloaded files. (e.g. \"LICENSE.txt\" is a terrible name to use)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "descr = build_dataset_dict(url='http://www.cis.hut.fi/research/lvq_pak/README', file_name=f'{dataset_name}.readme',\n",
+    "                       name='DESCR')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 38518594\r\n",
+      "drwx------@ 1 kjell  staff        16384 12 Jul 04:01 \u001b[34m.\u001b[m\u001b[m\r\n",
+      "drwx------  1 kjell  staff        16384 10 Jul 10:35 \u001b[34m..\u001b[m\u001b[m\r\n",
+      "-rwx------@ 1 kjell  staff         6148 10 Jul 09:31 \u001b[31m.DS_Store\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff      3769022 12 Jul 03:52 \u001b[31matt_faces.zip\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff    337926286 11 Jul 10:00 \u001b[31mdev-clean.tar.gz\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff    161980953 11 Jul 08:48 \u001b[31mfsew0_v1.1.tar.gz\u001b[m\u001b[m\r\n",
+      "-rwx------@ 1 kjell  staff  18859619587 10 Jul 06:27 \u001b[31mhm2014-data.zip\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff          600 11 Jul 20:22 \u001b[31mlibrispeech.hashes\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff        18652 11 Jul 10:06 \u001b[31mlibrispeech.license\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff         4958 12 Jul 05:24 \u001b[31mlvq-pak.readme\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff       747520 12 Jul 04:01 \u001b[31mlvq_pak-3.1.tar\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff         8448 10 Jul 10:17 \u001b[31mmocha-timit.descr\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff         2130 10 Jul 10:17 \u001b[31mmocha-timit.license\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff    196239360 11 Jul 08:46 \u001b[31mmsak0_v1.1.tar\u001b[m\u001b[m\r\n",
+      "-rwx------  1 kjell  staff    161159737 11 Jul 08:46 \u001b[31mmsak0_v1.1.tar.gz\u001b[m\u001b[m\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "# notice the files have been downloaded to the RAW directory\n",
+    "!ls -la ../data/raw"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar',\n",
+       "  'hash_type': 'sha1',\n",
+       "  'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',\n",
+       "  'name': None,\n",
+       "  'file_name': None},\n",
+       " {'url': 'http://www.cis.hut.fi/research/lvq_pak/README',\n",
+       "  'hash_type': 'sha1',\n",
+       "  'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',\n",
+       "  'name': 'DESCR',\n",
+       "  'file_name': 'lvq-pak.readme'}]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# This is what the generated URL list looks like\n",
+    "url_list = [lvq_pak, descr]\n",
+    "url_list"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add the url list to the dataset dictionary\n",
+    "Eventually, you will need to add a function to process the data.\n",
+    "If you don't specify one, a generic function is used that just sets the\n",
+    "LICENSE and DESCR text if possible\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'load_function': functools.partial(<function new_dataset at 0x11f66a840>, dataset_name='lvq-pak'),\n",
+      " 'load_function_name': 'new_dataset',\n",
+      " 'load_function_options': {'dataset_name': 'lvq-pak'},\n",
+      " 'url_list': [{'file_name': None,\n",
+      "               'hash_type': 'sha1',\n",
+      "               'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',\n",
+      "               'name': None,\n",
+      "               'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar'},\n",
+      "              {'file_name': 'lvq-pak.readme',\n",
+      "               'hash_type': 'sha1',\n",
+      "               'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',\n",
+      "               'name': 'DESCR',\n",
+      "               'url': 'http://www.cis.hut.fi/research/lvq_pak/README'}]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Add this key into the (internal) dataset dictionary\n",
+    "newds_dict = datasets.add_dataset_by_urllist(dataset_name, url_list)\n",
+    "pprint(newds_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now, call the (generic) load function and notice that DESCR has been set (LICENSE stays None: no file named LICENSE was listed)\n",
+    "dset = newds_dict['load_function']()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "************************************************************************\n",
+      "*                                                                      *\n",
+      "*                              LVQ_PAK                                 *\n",
+      "*                                                                      *\n",
+      "*                                The                                   *\n",
+      "*                                                                      *\n",
+      "*                   Learning  Vector  Quantization                     *\n",
+      "*                                                                      *\n",
+      "*                          Program  Package                            *\n",
+      "*                                                                      *\n",
+      "*                   Version 3.1 (April 7, 1995)                        *\n",
+      "*                                                                      *\n",
+      "*                          Prepared by the                             *\n",
+      "*                    LVQ Programming Team of the                       *\n",
+      "*                 Helsinki University of Technology                    *\n",
+      "*           Laboratory of Computer and Information Science             *\n",
+      "*                Rakentajanaukio 2 C, SF-02150 Espoo                   *\n",
+      "*                              FINLAND                                 *\n",
+      "*                                                                      *\n",
+      "*                      Copyright (c) 1991-1995                         *\n",
+      "*                                                                      *\n",
+      "************************************************************************\n",
+      "*                                                                      *\n",
+      "*  NOTE: This program package is copyrighted in the sense that it      *\n",
+      "*  may be used for scientific purposes. The package as a whole, or     *\n",
+      "*  parts thereof, cannot be included or used in any commercial         *\n",
+      "*  application without written permission granted by its producents.   *\n",
+      "*  No programs contained in this package may be copied for commercial  *\n",
+      "*  distribution.                                                       *\n",
+      "*                                                                      *\n",
+      "*  All comments concerning this program package may be sent to the     *\n",
+      "*  e-mail address 'lvq@nucleus.hut.fi'.                                *\n",
+      "*                                                                      *\n",
+      "************************************************************************\n",
+      "\n",
+      "This package contains all the programs necessary for the correct\n",
+      "application of certain LVQ (Learning Vector Quantization) algorithms\n",
+      "in an arbitrary statistical classification or pattern recognition\n",
+      "task.  To this package four options for the algorithms, the\n",
+      "LVQ1, the LVQ2.1, the LVQ3 and the OLVQ1, have been selected.  \n",
+      "\n",
+      "In the implementation of the LVQ programs we have tried to use as\n",
+      "simple a code as possible.  Therefore the programs are supposed to\n",
+      "compile in various machines without any specific modifications made on\n",
+      "the code.  All programs have been written in ANSI C.\n",
+      "\n",
+      "The lvq_pak program package includes the following files:\n",
+      "  - Documentation:\n",
+      "      README             this file\n",
+      "      lvq_doc.ps         documentation in (c) PostScript format\n",
+      "      lvq_doc.ps.Z       same as above but compressed\n",
+      "      lvq_doc.txt        documentation in ASCII format\n",
+      "  - Source file archives:\n",
+      "      lvq_p3r1.exe       Self-extracting MS-DOS archive file\n",
+      "      lvq_pak-3.1.tar    UNIX tape archive file\n",
+      "      lvq_pak-3.1.tar.Z  same as above but compressed\n",
+      "\n",
+      "Installation in UNIX (in more detail, see lvq_doc.ps/txt):\n",
+      "  - Uncompress lvq_pak-3.1.tar.Z\n",
+      "  - Extract the files with \"tar xovf lvq_pak-3.1.tar\" which creates\n",
+      "    the subdirectory lvq_pak-3.1\n",
+      "  - Copy makefile.unix to the name makefile\n",
+      "  - Revise switches in the makefile, if necessary\n",
+      "  - Execute \"make\"\n",
+      "\n",
+      "Installation in MS-DOS (in more detail, see lvq_doc.ps/txt):\n",
+      "  - By executing the command lvq_p3r1 the self-extracting archive\n",
+      "    creates the directory lvq_pak.3r1 and extracts all the files in it\n",
+      "  - You are supposed to use Borland C++ Version 3.1 and to have\n",
+      "    all the necessary environment settings\n",
+      "  - Copy the file makefile.dos to the name makefile\n",
+      "  - Revise the compiler switches in the makefile, if necessary\n",
+      "  - Execute \"make\"\n",
+      "\n",
+      "Revision history:\n",
+      "  - Version 1.0 was released 19 December 1991.\n",
+      "  - Version 1.1 containing only a minor bug fix in memory allocation\n",
+      "    was released 31 December 1991.\n",
+      "  - Version 2.0 containing major modifications in the algorithms was\n",
+      "    released January 31, 1992.\n",
+      "  - Version 2.1 containing some improvements in the speed of algorithms\n",
+      "    and one new program was released October 9, 1992.\n",
+      "  - Version 3.0 containing many advanced features conserning application\n",
+      "    of the algorithms in large problems was released March 1, 1995; for\n",
+      "    these changes see documentation.\n",
+      "  - Version 3.1 containing only a bug fix in random ordering\n",
+      "    was released 7 April 1995.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dset.DESCR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dset.LICENSE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.LICENSE = '''\n",
+    "************************************************************************\n",
+    "*                                                                      *\n",
+    "*                              LVQ_PAK                                 *\n",
+    "*                                                                      *\n",
+    "*                                The                                   *\n",
+    "*                                                                      *\n",
+    "*                   Learning  Vector  Quantization                     *\n",
+    "*                                                                      *\n",
+    "*                          Program  Package                            *\n",
+    "*                                                                      *\n",
+    "*                   Version 3.1 (April 7, 1995)                        *\n",
+    "*                                                                      *\n",
+    "*                          Prepared by the                             *\n",
+    "*                    LVQ Programming Team of the                       *\n",
+    "*                 Helsinki University of Technology                    *\n",
+    "*           Laboratory of Computer and Information Science             *\n",
+    "*                Rakentajanaukio 2 C, SF-02150 Espoo                   *\n",
+    "*                              FINLAND                                 *\n",
+    "*                                                                      *\n",
+    "*                      Copyright (c) 1991-1995                         *\n",
+    "*                                                                      *\n",
+    "************************************************************************\n",
+    "*                                                                      *\n",
+    "*  NOTE: This program package is copyrighted in the sense that it      *\n",
+    "*  may be used for scientific purposes. The package as a whole, or     *\n",
+    "*  parts thereof, cannot be included or used in any commercial         *\n",
+    "*  application without written permission granted by its producents.   *\n",
+    "*  No programs contained in this package may be copied for commercial  *\n",
+    "*  distribution.                                                       *\n",
+    "*                                                                      *\n",
+    "*  All comments concerning this program package may be sent to the     *\n",
+    "*  e-mail address 'lvq@nucleus.hut.fi'.                                *\n",
+    "*                                                                      *\n",
+    "************************************************************************\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "sklearn.utils.Bunch"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(dset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Adding a process_data method\n",
+    "The next step is to write the importer that actually processes the data into a usable format.\n",
+    "Usually, this gets added to `datasets.py`\n",
+    "\n",
+    "The important things to add are `data` and `target` entries. `metadata` is optional, but recommended if you want to do things like interactive visualizations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Unpack the file\n",
+    "untar_dir = fetch_and_unpack(dataset_name)\n",
+    "unpack_dir = untar_dir / 'lvq_pak-3.1'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "unpack_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_lvqpak_dat(filename):\n",
+    "    \"\"\"Read an LVQ-PAK formatted file\"\"\"\n",
+    "    with open(filename, 'r') as fd:\n",
+    "        df = pd.read_table(fd, skiprows=[0,1], skip_blank_lines=True, comment=None, header=None, sep=' ', dtype=str)\n",
+    "        # targets are last column. Data is everything else\n",
+    "        target = df.loc[:,df.columns[-1]].values\n",
+    "        data = df.loc[:,df.columns[:-1]].values\n",
+    "        return data, target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from phoneme.data.datasets import new_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_lvq_pak(kind='all'):\n",
+    "    \"\"\"\n",
+    "    kind: {'test', 'train', 'all'}, default 'all'\n",
+    "    \"\"\"\n",
+    "    dset = new_dataset(dataset_name='lvq-pak')\n",
+    "    if kind == 'train':\n",
+    "        dset['data'], dset['target'] = read_lvqpak_dat(unpack_dir / 'ex1.dat')\n",
+    "    elif kind == 'test':\n",
+    "        dset['data'], dset['target'] = read_lvqpak_dat(unpack_dir / 'ex2.dat')\n",
+    "    elif kind == 'all':\n",
+    "        data, target = read_lvqpak_dat(unpack_dir / 'ex1.dat')\n",
+    "        data2, target2 = read_lvqpak_dat(unpack_dir / 'ex2.dat')\n",
+    "        dset['data'] = np.vstack((data, data2))\n",
+    "        dset['target'] = np.append(target, target2)\n",
+    "    else:\n",
+    "        raise Exception(f'Unknown kind: {kind}')\n",
+    "    \n",
+    "    return dset\n",
+    "        \n",
+    "        \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset = process_lvq_pak(kind='all')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(3923, 20)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dset.data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(3923,)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dset.target.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['21.47', '-19.90', '-20.68', '-6.73', '13.67', '-11.95', '13.83',\n",
+       "       '12.02', '7.62', '-6.15', '-4.38', '-2.91', '4.80', '-7.39',\n",
+       "       '-3.54', '-0.87', '-5.02', '-1.41', '-2.33', '2.12'], dtype=object)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dset.data[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'A'"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dset.target[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}