From d7b5148b989cfd4ca38a1e21aa6cac651486e048 Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Wed, 11 Jul 2018 20:32:13 -0400 Subject: [PATCH 1/3] more helper functions for adding datasets --- .../data/datasets.py | 13 +++++++ .../data/utils.py | 34 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py index 7ad8367..53313cc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py @@ -17,6 +17,7 @@ jlmem = Memory(cachedir=str(interim_data_path)) def new_dataset(*, dataset_name): + global dataset_raw_files dset = Bunch() dset['metadata'] = {} @@ -33,6 +34,18 @@ def new_dataset(*, dataset_name): return dset +def add_dataset_by_urllist(dataset_name, url_list): + """Add a new dataset by specifying a url_list + + url_list is a list of dicts keyed by: + * url, hash_type, hash_value, name, file_name + """ + global dataset_raw_files + + dataset_raw_files[dataset_name] = {'url_list': url_list} + write_dataset() + dataset_raw_files = read_datasets() + return dataset_raw_files[dataset_name] @jlmem.cache def load_dataset(dataset_name, return_X_y=False, **kwargs): diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py index 78c13c5..50cb6bd 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py @@ -77,6 +77,40 @@ def fetch_files(force=False, dst_dir=None, **kwargs): result_list.append(fetch_file(force=force, dst_dir=dst_dir, **url_dict)) return all([r[0] for r in result_list]), result_list +def fetch_text_file(url, file_name=None, dst_dir=None, force=True, **kwargs): + """Fetch a 
text file (via URL) and return it as a string. + + Arguments + --------- + + file_name: + output file name. If not specified, use the last + component of the URL + dst_dir: + directory to place downloaded files + force: boolean + normally, the URL is only downloaded if `file_name` is + not present on the filesystem, or if the existing file has a + bad hash. If force is True, download is always attempted. + + In addition to these options, any of `fetch_file`'s keywords may + also be passed + + Returns + ------- + fetched string, or None if something went wrong with the download + """ + retlist = fetch_file(url, file_name=file_name, dst_dir=dst_dir, + force=force, **kwargs) + if retlist[0]: + status, filename, hashval = retlist + with open(filename, 'r') as txt: + return txt.read() + else: + logger.warning(f'fetch of {url} failed with status: {retlist[0]}') + return None + + def fetch_file(url, file_name=None, dst_dir=None, force=False, From b7d4009504fe7d55bc2cd7a59a3ca2a9f0032d0f Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Thu, 12 Jul 2018 05:24:16 -0400 Subject: [PATCH 2/3] unpack plain tarfiles too --- .../{{ cookiecutter.module_name }}/data/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py index 50cb6bd..7fae0cc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py @@ -220,6 +220,9 @@ def unpack(filename, dst_dir=None, create_dst=True): elif path.endswith('.tar.bz2') or path.endswith('.tbz'): archive = True opener, mode = tarfile.open, 'r:bz2' + elif path.endswith('.tar'): + archive = True + opener, mode = tarfile.open, 'r' elif path.endswith('.gz'): opener, mode = gzip.open, 'rb' outfile, outmode = path[:-3], 'wb' From 84669eff72c5d63236d2d2e859a5506aa5eac4a1 Mon Sep 17 00:00:00 2001 
From: Kjell Wooding Date: Thu, 12 Jul 2018 05:26:14 -0400 Subject: [PATCH 3/3] add an example dataset (and pull in pandas) --- {{ cookiecutter.repo_name }}/environment.yml | 1 + .../notebooks/00-add-example-dataset.ipynb | 706 ++++++++++++++++++ 2 files changed, 707 insertions(+) create mode 100644 {{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml index e06b4c2..540e40c 100644 --- a/{{ cookiecutter.repo_name }}/environment.yml +++ b/{{ cookiecutter.repo_name }}/environment.yml @@ -24,6 +24,7 @@ dependencies: - scikit-learn - joblib - nb_conda + - pandas {% if cookiecutter.python_interpreter == 'python3' -%} {{ py3dep()|indent(2, true) }} {% else -%} diff --git a/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb b/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb new file mode 100644 index 0000000..688374b --- /dev/null +++ b/{{ cookiecutter.repo_name }}/notebooks/00-add-example-dataset.ipynb @@ -0,0 +1,706 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from phoneme.data import datasets\n", + "from phoneme.data.utils import build_dataset_dict, hash_file, fetch_and_unpack, fetch_text_file\n", + "from phoneme.paths import raw_data_path\n", + "from functools import partial\n", + "from phoneme.paths import interim_data_path, raw_data_path\n", + "\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding the LVQ-PAK Finnish Phonetic dataset\n", + "\n", + "The Learning Vector Quantization project includes a simple Finnish phonetic dataset\n", + "consisting (presumably) of 20-dimensional feature vectors, each labelled with a phoneme." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dataset_name='lvq-pak'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, 
"metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "************************************************************************\n", + "* *\n", + "* LVQ_PAK *\n", + "* *\n", + "* The *\n", + "* *\n", + "* Learning Vector Quantization *\n", + "* *\n", + "* Program Package *\n", + "* *\n", + "* Version 3.1 (April 7, 1995) *\n", + "* *\n", + "* Prepared by the *\n", + "* LVQ Programming Team of the *\n", + "* Helsinki University of Technology *\n", + "* Laboratory of Computer and Information Science *\n", + "* Rakentajanaukio 2 C, SF-02150 Espoo *\n", + "* FINLAND *\n", + "* *\n", + "* Copyright (c) 1991-1995 *\n", + "* *\n", + "************************************************************************\n", + "* *\n", + "* NOTE: This program package is copyrighted in the sense that it *\n", + "* may be used for scientific purposes. The package as a whole, or *\n", + "* parts thereof, cannot be included or used in any commercial *\n", + "* application without written permission granted by its producents. *\n", + "* No programs contained in this package may be copied for commercial *\n", + "* distribution. *\n", + "* *\n", + "* All comments concerning this program package may be sent to the *\n", + "* e-mail address 'lvq@nucleus.hut.fi'. *\n", + "* *\n", + "************************************************************************\n", + "\n", + "This package contains all the programs necessary for the correct\n", + "application of certain LVQ (Learning Vector Quantization) algorithms\n", + "in an arbitrary statistical classification or pattern recognition\n", + "task. To this package four options for the algorithms, the\n", + "LVQ1, the LVQ2.1, the LVQ3 and the OLVQ1, have been selected. \n", + "\n", + "In the implementation of the LVQ programs we have tried to use as\n", + "simple a code as possible. 
Therefore the programs are supposed to\n", + "compile in various machines without any specific modifications made on\n", + "the code. All programs have been written in ANSI C.\n", + "\n", + "The lvq_pak program package includes the following files:\n", + " - Documentation:\n", + " README this file\n", + " lvq_doc.ps documentation in (c) PostScript format\n", + " lvq_doc.ps.Z same as above but compressed\n", + " lvq_doc.txt documentation in ASCII format\n", + " - Source file archives:\n", + " lvq_p3r1.exe Self-extracting MS-DOS archive file\n", + " lvq_pak-3.1.tar UNIX tape archive file\n", + " lvq_pak-3.1.tar.Z same as above but compressed\n", + "\n", + "Installation in UNIX (in more detail, see lvq_doc.ps/txt):\n", + " - Uncompress lvq_pak-3.1.tar.Z\n", + " - Extract the files with \"tar xovf lvq_pak-3.1.tar\" which creates\n", + " the subdirectory lvq_pak-3.1\n", + " - Copy makefile.unix to the name makefile\n", + " - Revise switches in the makefile, if necessary\n", + " - Execute \"make\"\n", + "\n", + "Installation in MS-DOS (in more detail, see lvq_doc.ps/txt):\n", + " - By executing the command lvq_p3r1 the self-extracting archive\n", + " creates the directory lvq_pak.3r1 and extracts all the files in it\n", + " - You are supposed to use Borland C++ Version 3.1 and to have\n", + " all the necessary environment settings\n", + " - Copy the file makefile.dos to the name makefile\n", + " - Revise the compiler switches in the makefile, if necessary\n", + " - Execute \"make\"\n", + "\n", + "Revision history:\n", + " - Version 1.0 was released 19 December 1991.\n", + " - Version 1.1 containing only a minor bug fix in memory allocation\n", + " was released 31 December 1991.\n", + " - Version 2.0 containing major modifications in the algorithms was\n", + " released January 31, 1992.\n", + " - Version 2.1 containing some improvements in the speed of algorithms\n", + " and one new program was released October 9, 1992.\n", + " - Version 3.0 containing many advanced 
features conserning application\n", + " of the algorithms in large problems was released March 1, 1995; for\n", + " these changes see documentation.\n", + " - Version 3.1 containing only a bug fix in random ordering\n", + " was released 7 April 1995.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(fetch_text_file('http://www.cis.hut.fi/research/lvq_pak/README', file_name=f'{dataset_name}.readme'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We do not supply a hash for the LVQ-PAK files here, so `build_dataset_dict` downloads each file and records its hash for us." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Grab the source code package\n", + "lvq_pak = build_dataset_dict(url=\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you download files with names DESCR or LICENSE, they will be used as the description and license text respectively.\n", + "Usually you will want to give these unique names, so they don't clash with other downloaded files. (e.g. 
\"LICENSE.txt\" is a terrible name to use)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "descr = build_dataset_dict(url='http://www.cis.hut.fi/research/lvq_pak/README', file_name=f'{dataset_name}.readme',\n", + " name='DESCR')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 38518594\r\n", + "drwx------@ 1 kjell staff 16384 12 Jul 04:01 \u001b[34m.\u001b[m\u001b[m\r\n", + "drwx------ 1 kjell staff 16384 10 Jul 10:35 \u001b[34m..\u001b[m\u001b[m\r\n", + "-rwx------@ 1 kjell staff 6148 10 Jul 09:31 \u001b[31m.DS_Store\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 3769022 12 Jul 03:52 \u001b[31matt_faces.zip\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 337926286 11 Jul 10:00 \u001b[31mdev-clean.tar.gz\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 161980953 11 Jul 08:48 \u001b[31mfsew0_v1.1.tar.gz\u001b[m\u001b[m\r\n", + "-rwx------@ 1 kjell staff 18859619587 10 Jul 06:27 \u001b[31mhm2014-data.zip\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 600 11 Jul 20:22 \u001b[31mlibrispeech.hashes\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 18652 11 Jul 10:06 \u001b[31mlibrispeech.license\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 4958 12 Jul 05:24 \u001b[31mlvq-pak.readme\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 747520 12 Jul 04:01 \u001b[31mlvq_pak-3.1.tar\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 8448 10 Jul 10:17 \u001b[31mmocha-timit.descr\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 2130 10 Jul 10:17 \u001b[31mmocha-timit.license\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 196239360 11 Jul 08:46 \u001b[31mmsak0_v1.1.tar\u001b[m\u001b[m\r\n", + "-rwx------ 1 kjell staff 161159737 11 Jul 08:46 \u001b[31mmsak0_v1.1.tar.gz\u001b[m\u001b[m\r\n" + ] + } + ], + "source": [ + "# notice the files have been downloaded to the RAW directory\n", + 
"!ls -la ../data/raw" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar',\n", + " 'hash_type': 'sha1',\n", + " 'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',\n", + " 'name': None,\n", + " 'file_name': None},\n", + " {'url': 'http://www.cis.hut.fi/research/lvq_pak/README',\n", + " 'hash_type': 'sha1',\n", + " 'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',\n", + " 'name': 'DESCR',\n", + " 'file_name': 'lvq-pak.readme'}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This is what the generated URL list looks like\n", + "url_list = [lvq_pak, descr]\n", + "url_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add the url list to the dataset dictionary\n", + "Eventually, you will need to add a function to process the data.\n", + "If you don't specify one, a generic function is used that just sets the\n", + "LICENSE and DESCR text if possible\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'load_function': functools.partial(, dataset_name='lvq-pak'),\n", + " 'load_function_name': 'new_dataset',\n", + " 'load_function_options': {'dataset_name': 'lvq-pak'},\n", + " 'url_list': [{'file_name': None,\n", + " 'hash_type': 'sha1',\n", + " 'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',\n", + " 'name': None,\n", + " 'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar'},\n", + " {'file_name': 'lvq-pak.readme',\n", + " 'hash_type': 'sha1',\n", + " 'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',\n", + " 'name': 'DESCR',\n", + " 'url': 'http://www.cis.hut.fi/research/lvq_pak/README'}]}\n" + ] + } + ], + "source": [ + "# Add this key into the (internal) dataset dictionary\n", + 
"newds_dict = datasets.add_dataset_by_urllist(dataset_name, url_list)\n", + "pprint(newds_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, call the (generic) load function and notice that the LICENSE and DESCR have been set\n", + "dset = newds_dict['load_function']()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "************************************************************************\n", + "* *\n", + "* LVQ_PAK *\n", + "* *\n", + "* The *\n", + "* *\n", + "* Learning Vector Quantization *\n", + "* *\n", + "* Program Package *\n", + "* *\n", + "* Version 3.1 (April 7, 1995) *\n", + "* *\n", + "* Prepared by the *\n", + "* LVQ Programming Team of the *\n", + "* Helsinki University of Technology *\n", + "* Laboratory of Computer and Information Science *\n", + "* Rakentajanaukio 2 C, SF-02150 Espoo *\n", + "* FINLAND *\n", + "* *\n", + "* Copyright (c) 1991-1995 *\n", + "* *\n", + "************************************************************************\n", + "* *\n", + "* NOTE: This program package is copyrighted in the sense that it *\n", + "* may be used for scientific purposes. The package as a whole, or *\n", + "* parts thereof, cannot be included or used in any commercial *\n", + "* application without written permission granted by its producents. *\n", + "* No programs contained in this package may be copied for commercial *\n", + "* distribution. *\n", + "* *\n", + "* All comments concerning this program package may be sent to the *\n", + "* e-mail address 'lvq@nucleus.hut.fi'. 
*\n", + "* *\n", + "************************************************************************\n", + "\n", + "This package contains all the programs necessary for the correct\n", + "application of certain LVQ (Learning Vector Quantization) algorithms\n", + "in an arbitrary statistical classification or pattern recognition\n", + "task. To this package four options for the algorithms, the\n", + "LVQ1, the LVQ2.1, the LVQ3 and the OLVQ1, have been selected. \n", + "\n", + "In the implementation of the LVQ programs we have tried to use as\n", + "simple a code as possible. Therefore the programs are supposed to\n", + "compile in various machines without any specific modifications made on\n", + "the code. All programs have been written in ANSI C.\n", + "\n", + "The lvq_pak program package includes the following files:\n", + " - Documentation:\n", + " README this file\n", + " lvq_doc.ps documentation in (c) PostScript format\n", + " lvq_doc.ps.Z same as above but compressed\n", + " lvq_doc.txt documentation in ASCII format\n", + " - Source file archives:\n", + " lvq_p3r1.exe Self-extracting MS-DOS archive file\n", + " lvq_pak-3.1.tar UNIX tape archive file\n", + " lvq_pak-3.1.tar.Z same as above but compressed\n", + "\n", + "Installation in UNIX (in more detail, see lvq_doc.ps/txt):\n", + " - Uncompress lvq_pak-3.1.tar.Z\n", + " - Extract the files with \"tar xovf lvq_pak-3.1.tar\" which creates\n", + " the subdirectory lvq_pak-3.1\n", + " - Copy makefile.unix to the name makefile\n", + " - Revise switches in the makefile, if necessary\n", + " - Execute \"make\"\n", + "\n", + "Installation in MS-DOS (in more detail, see lvq_doc.ps/txt):\n", + " - By executing the command lvq_p3r1 the self-extracting archive\n", + " creates the directory lvq_pak.3r1 and extracts all the files in it\n", + " - You are supposed to use Borland C++ Version 3.1 and to have\n", + " all the necessary environment settings\n", + " - Copy the file makefile.dos to the name makefile\n", + " - Revise the 
compiler switches in the makefile, if necessary\n", + " - Execute \"make\"\n", + "\n", + "Revision history:\n", + " - Version 1.0 was released 19 December 1991.\n", + " - Version 1.1 containing only a minor bug fix in memory allocation\n", + " was released 31 December 1991.\n", + " - Version 2.0 containing major modifications in the algorithms was\n", + " released January 31, 1992.\n", + " - Version 2.1 containing some improvements in the speed of algorithms\n", + " and one new program was released October 9, 1992.\n", + " - Version 3.0 containing many advanced features conserning application\n", + " of the algorithms in large problems was released March 1, 1995; for\n", + " these changes see documentation.\n", + " - Version 3.1 containing only a bug fix in random ordering\n", + " was released 7 April 1995.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(dset.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "print(dset.LICENSE)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "dset.LICENSE = '''\n", + "************************************************************************\n", + "* *\n", + "* LVQ_PAK *\n", + "* *\n", + "* The *\n", + "* *\n", + "* Learning Vector Quantization *\n", + "* *\n", + "* Program Package *\n", + "* *\n", + "* Version 3.1 (April 7, 1995) *\n", + "* *\n", + "* Prepared by the *\n", + "* LVQ Programming Team of the *\n", + "* Helsinki University of Technology *\n", + "* Laboratory of Computer and Information Science *\n", + "* Rakentajanaukio 2 C, SF-02150 Espoo *\n", + "* FINLAND *\n", + "* *\n", + "* Copyright (c) 1991-1995 *\n", + "* *\n", + "************************************************************************\n", + "* *\n", + "* NOTE: This program package is copyrighted in the sense that it *\n", + "* may be 
used for scientific purposes. The package as a whole, or *\n", + "* parts thereof, cannot be included or used in any commercial *\n", + "* application without written permission granted by its producents. *\n", + "* No programs contained in this package may be copied for commercial *\n", + "* distribution. *\n", + "* *\n", + "* All comments concerning this program package may be sent to the *\n", + "* e-mail address 'lvq@nucleus.hut.fi'. *\n", + "* *\n", + "************************************************************************\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sklearn.utils.Bunch" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(dset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding a process_data method\n", + "The next step is to write the importer that actually processes the data into a usable format.\n", + "Usually, this gets added to `datasets.py`\n", + "\n", + "The important things to add are `data` and `target` entries. 
`metadata` is optional, but recommended if you want to do things like interactive visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Unpack the file\n", + "untar_dir = fetch_and_unpack(dataset_name)\n", + "unpack_dir = untar_dir / 'lvq_pak-3.1'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpack_dir.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def read_lvqpak_dat(filename):\n", + " \"\"\"Read an LVQ-PAK formatted file\"\"\"\n", + " with open(filename, 'r') as fd:\n", + " df = pd.read_table(fd, skiprows=[0,1], skip_blank_lines=True, comment=None, header=None, sep=' ', dtype=str)\n", + " # targets are last column. 
Data is everything else\n", + " target = df.loc[:,df.columns[-1]].values\n", + " data = df.loc[:,df.columns[:-1]].values\n", + " return data, target" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from phoneme.data.datasets import new_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def process_lvq_pak(kind='all'):\n", + " \"\"\"\n", + " kind: {'test', 'train', 'all'}, default 'all'\n", + " \"\"\"\n", + " dset = new_dataset(dataset_name='lvq-pak')\n", + " if kind == 'train':\n", + " dset['data'], dset['target'] = read_lvqpak_dat(unpack_dir / 'ex1.dat')\n", + " elif kind == 'test':\n", + " dset['data'], dset['target'] = read_lvqpak_dat(unpack_dir / 'ex2.dat')\n", + " elif kind == 'all':\n", + " data, target = read_lvqpak_dat(unpack_dir / 'ex1.dat')\n", + " data2, target2 = read_lvqpak_dat(unpack_dir / 'ex2.dat')\n", + " dset['data'] = np.vstack((data, data2))\n", + " dset['target'] = np.append(target, target2)\n", + " else:\n", + " raise Exception(f'Unknown kind: {kind}')\n", + " \n", + " return dset\n", + " \n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "dset = process_lvq_pak(kind='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3923, 20)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset.data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3923,)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset.target.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": 
[ + "array(['21.47', '-19.90', '-20.68', '-6.73', '13.67', '-11.95', '13.83',\n", + " '12.02', '7.62', '-6.15', '-4.38', '-2.91', '4.80', '-7.39',\n", + " '-3.54', '-0.87', '-5.02', '-1.41', '-2.33', '2.12'], dtype=object)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset.data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'A'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset.target[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}