diff --git a/tutorials/setup_template/.env.example b/tutorials/setup_template/.env.example new file mode 100644 index 0000000..dc1a894 --- /dev/null +++ b/tutorials/setup_template/.env.example @@ -0,0 +1,3 @@ +MLFLOW_TRACKING_URI=./mlruns +DATA_CACHE_DIR=./.hf_cache + diff --git a/tutorials/setup_template/.gitignore b/tutorials/setup_template/.gitignore new file mode 100644 index 0000000..c0c5fc5 --- /dev/null +++ b/tutorials/setup_template/.gitignore @@ -0,0 +1,16 @@ +# Environment +.env +nlp311/ + +# Runtime artifacts +data/ +mlruns/ +reports/ +.hf_cache/ + +# Python +__pycache__/ +*.pyc + +# OS +.DS_Store diff --git a/tutorials/setup_template/configs/baseline.yaml b/tutorials/setup_template/configs/baseline.yaml new file mode 100644 index 0000000..21f8066 --- /dev/null +++ b/tutorials/setup_template/configs/baseline.yaml @@ -0,0 +1,14 @@ +experiment_name: "t0_setup_template" +dataset: "ag_news" +test_size: 0.2 +random_state: 42 +tfidf: + max_features: 30000 + ngram_range: [1,2] +model: + type: "logreg" + C: 2.0 + max_iter: 200 +metrics: + average: "macro" + diff --git a/tutorials/setup_template/notebooks/01_setup_and_eda.ipynb b/tutorials/setup_template/notebooks/01_setup_and_eda.ipynb new file mode 100644 index 0000000..3f471cb --- /dev/null +++ b/tutorials/setup_template/notebooks/01_setup_and_eda.ipynb @@ -0,0 +1,1067 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook 1: Setup & Exploratory Data Analysis\n", + "\n", + "Welcome to the **Hack for LA Data Science** setup template tutorial!\n", + "\n", + "This is the first of three notebooks that walk you through a complete NLP (Natural Language Processing) text classification pipeline. By the end of this series, you'll have trained a model that can read a news headline and predict its category.\n", + "\n", + "**What you'll do in this notebook:**\n", + "\n", + "1. Set up your Python environment and install dependencies\n", + "2.
Load a real-world news dataset from Hugging Face\n", + "3. Explore the data to understand what we're working with\n", + "\n", + "**Why this matters:** Before building any model, you need to understand your data. Exploratory Data Analysis (EDA) helps you catch problems early — things like imbalanced classes, unexpected text lengths, or missing values — that would otherwise silently hurt your model's performance.\n", + "\n", + "---\n", + "\n", + "### Notebook series\n", + "\n", + "| Notebook | Focus |\n", + "|---|---|\n", + "| **01 — Setup & EDA** (you are here) | Environment, data loading, exploration |\n", + "| 02 — Train & Evaluate | Build, train, and evaluate a text classifier |\n", + "| 03 — Serve & Predict | Load the trained model and make predictions |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Environment Setup\n", + "\n", + "Before writing any code, we need to make sure our tools are installed and configured. Think of this like setting up your workbench before starting a project.\n", + "\n", + "### What is a virtual environment?\n", + "\n", + "A **virtual environment** is an isolated Python installation. Most operating systems come with a version of Python already installed: if you open a terminal or a powershell and type `python` you'll see something like the following:\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "Image borrowed from [tutorialsteacher](https://www.tutorialsteacher.com/python/python-interective-shell)\n", + "\n", + "Because there's a lot of variability in dependencies and versions, it's standard practice to use a virtual environment to keep your project's packages separate from your system Python, so you won't accidentally break other projects (or vice versa). Every serious Python project should use one.\n", + "\n", + "### Prerequisites\n", + "- **Python 3.11+** installed on your system\n", + "- This notebook should be run from the `notebooks/` directory inside the project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1a. Locate the Project Root\n", + "\n", + "Now we need to determine the root of the project we're setting everything up in. This matters because we need to find files like `requirements.txt` and `configs/baseline.yaml` regardless of where you opened the notebook from." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template\n" + ] + } + ], + "source": [ + "import subprocess, sys, os\n", + "from pathlib import Path\n", + "\n", + "# Try to detect the project root automatically.\n", + "# If running in VS Code, __vsc_ipynb_file__ gives us the notebook's path.\n", + "# Otherwise, we assume the notebook is in a 'notebooks/' subdirectory.\n", + "PROJECT_ROOT = (\n", + " Path(\"__vsc_ipynb_file__\").resolve().parent.parent\n", + " if \"__vsc_ipynb_file__\" in dir()\n", + " else Path.cwd().parent\n", + ")\n", + "\n", + "# Safety check: if we can't find requirements.txt, we're probably already at the root\n", + "if not (PROJECT_ROOT / \"requirements.txt\").exists():\n", + " PROJECT_ROOT = Path.cwd()\n", + "\n", + "print(f\"Project root: {PROJECT_ROOT}\")" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### 1b. Create the Virtual Environment\n", + "\n", + "This creates a virtual environment called `nlp311` in the project root. If it already exists, the cell simply skips creation.\n", + "\n", + "> **Note:** Creating the venv from inside the notebook is convenient for this tutorial, but in day-to-day work you'd typically do this once from the terminal." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Virtual environment already exists at /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template/nlp311\n", + "\n", + "To activate in a terminal, run:\n", + " source /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template/nlp311/bin/activate\n" + ] + } + ], + "source": [ + "venv_path = PROJECT_ROOT / \"nlp311\"\n", + "\n", + "if not venv_path.exists():\n", + " # this is calling python -m venv venv_path\n", + " subprocess.check_call([sys.executable, \"-m\", \"venv\", str(venv_path)])\n", + " print(f\"Created virtual environment at {venv_path}\")\n", + "else:\n", + " print(f\"Virtual environment already exists at {venv_path}\")\n", + "\n", + "# Show the activation command for terminal use\n", + "if sys.platform == \"win32\":\n", + " activate_cmd = str(venv_path / \"Scripts\" / \"activate\")\n", + "else:\n", + " activate_cmd = f\"source {venv_path / 'bin' / 'activate'}\"\n", + "\n", + "print(f\"\\nTo activate in a terminal, run:\\n {activate_cmd}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1c. Install Dependencies\n", + "\n", + "The `requirements.txt` file lists every Python package this project needs, along with specific version numbers. 
Pinning versions ensures everyone on the team gets the same behavior — no \"works on my machine\" surprises.\n", + "\n", + "Key packages we're installing:\n", + "- **pandas** — tabular data manipulation (think Excel spreadsheets in Python)\n", + "- **scikit-learn** — classic machine learning algorithms and utilities\n", + "- **datasets** — Hugging Face's library for downloading and managing datasets\n", + "- **mlflow** — experiment tracking (logs your model runs so you can compare them later)\n", + "- **matplotlib** — plotting and visualization\n", + "- **fastapi / uvicorn** — for serving predictions as a web API (used in notebook 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas==2.2.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 1)) (2.2.2)\n", + "Requirement already satisfied: scikit-learn==1.5.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 2)) (1.5.2)\n", + "Requirement already satisfied: spacy==3.7.6 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 3)) (3.7.6)\n", + "Requirement already satisfied: matplotlib==3.9.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 4)) (3.9.2)\n", + "Requirement already satisfied: datasets==3.0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 5)) (3.0.1)\n", + "Requirement already satisfied: transformers==4.44.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 6)) (4.44.2)\n", + "Requirement already satisfied: mlflow==2.16.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 7)) (2.16.2)\n", + "Requirement already satisfied: numpy==1.26.4 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 8)) (1.26.4)\n", + "Requirement already satisfied: pyyaml==6.0.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 9)) (6.0.2)\n", + "Requirement already satisfied: python-dotenv==1.0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 10)) (1.0.1)\n", + "Requirement already satisfied: pydantic==2.9.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 11)) (2.9.2)\n", + "Requirement already satisfied: fastapi==0.115.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 12)) (0.115.0)\n", + "Requirement already satisfied: uvicorn==0.30.6 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from -r ../requirements.txt (line 13)) (0.30.6)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pandas==2.2.2->-r ../requirements.txt (line 1)) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pandas==2.2.2->-r ../requirements.txt (line 1)) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pandas==2.2.2->-r ../requirements.txt (line 1)) (2025.3)\n", + "Requirement already satisfied: scipy>=1.6.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from scikit-learn==1.5.2->-r ../requirements.txt (line 2)) (1.16.3)\n", + "Requirement already satisfied: joblib>=1.2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from scikit-learn==1.5.2->-r ../requirements.txt (line 2)) (1.5.3)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from scikit-learn==1.5.2->-r ../requirements.txt (line 2)) (3.6.0)\n", + 
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (1.0.15)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (2.0.13)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (3.0.12)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (8.2.5)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (1.1.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (2.5.2)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (2.0.10)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (0.4.3)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (0.23.2)\n", + "Requirement 
already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (4.67.1)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (2.32.3)\n", + "Requirement already satisfied: jinja2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (3.1.6)\n", + "Requirement already satisfied: setuptools in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (80.1.0)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (24.2)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from spacy==3.7.6->-r ../requirements.txt (line 3)) (3.5.1)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pydantic==2.9.2->-r ../requirements.txt (line 11)) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pydantic==2.9.2->-r ../requirements.txt (line 11)) (2.23.4)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from pydantic==2.9.2->-r ../requirements.txt (line 11)) (4.15.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (12.1.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from matplotlib==3.9.2->-r ../requirements.txt (line 4)) (3.3.2)\n", + "Requirement already satisfied: filelock in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (3.20.3)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (17.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (0.3.8)\n", + "Requirement already satisfied: xxhash in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (3.6.0)\n", + "Requirement already satisfied: multiprocess in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==3.0.1->-r ../requirements.txt (line 5)) (2024.6.1)\n", + "Requirement already satisfied: aiohttp in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (3.13.3)\n", + "Requirement already satisfied: huggingface-hub>=0.22.0 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from datasets==3.0.1->-r ../requirements.txt (line 5)) (0.36.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from transformers==4.44.2->-r ../requirements.txt (line 6)) (2025.11.3)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from transformers==4.44.2->-r ../requirements.txt (line 6)) (0.7.0)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from transformers==4.44.2->-r ../requirements.txt (line 6)) (0.19.1)\n", + "Requirement already satisfied: mlflow-skinny==2.16.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.16.2)\n", + "Requirement already satisfied: Flask<4 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.1.2)\n", + "Requirement already satisfied: alembic!=1.10.0,<2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (1.18.4)\n", + "Requirement already satisfied: docker<8,>=4.0.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (7.1.0)\n", + "Requirement already satisfied: graphene<4 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.4.3)\n", + "Requirement already satisfied: markdown<4,>=3.3 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.10)\n", + "Requirement already satisfied: sqlalchemy<3,>=1.4.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.0.46)\n", + "Requirement already satisfied: gunicorn<24 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow==2.16.2->-r ../requirements.txt (line 7)) (23.0.0)\n", + "Requirement already satisfied: starlette<0.39.0,>=0.37.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from fastapi==0.115.0->-r ../requirements.txt (line 12)) (0.38.6)\n", + "Requirement already satisfied: click>=7.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from uvicorn==0.30.6->-r ../requirements.txt (line 13)) (8.3.1)\n", + "Requirement already satisfied: h11>=0.8 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from uvicorn==0.30.6->-r ../requirements.txt (line 13)) (0.16.0)\n", + "Requirement already satisfied: cachetools<6,>=5.0.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (5.5.2)\n", + "Requirement already satisfied: cloudpickle<4 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.1.2)\n", + "Requirement already satisfied: databricks-sdk<1,>=0.20.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (0.88.0)\n", + "Requirement already satisfied: gitpython<4,>=3.1.9 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.1.46)\n", + "Requirement already satisfied: importlib-metadata!=4.7.0,<9,>=3.7.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (8.7.1)\n", + "Requirement already satisfied: opentelemetry-api<3,>=1.9.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (1.39.1)\n", + "Requirement already satisfied: opentelemetry-sdk<3,>=1.9.0 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (1.39.1)\n", + "Requirement already satisfied: protobuf<6,>=3.12.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (5.29.6)\n", + "Requirement already satisfied: sqlparse<1,>=0.4.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (0.5.5)\n", + "Requirement already satisfied: Mako in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from alembic!=1.10.0,<2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (1.3.10)\n", + "Requirement already satisfied: google-auth~=2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.48.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from docker<8,>=4.0.0->mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.4.0)\n", + "Requirement already satisfied: blinker>=1.9.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from Flask<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (1.9.0)\n", + "Requirement already satisfied: itsdangerous>=2.2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from Flask<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.2.0)\n", + "Requirement already satisfied: markupsafe>=2.1.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from Flask<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.0.3)\n", + "Requirement already satisfied: werkzeug>=3.1.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from Flask<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.1.5)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from gitpython<4,>=3.1.9->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (4.0.12)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from gitdb<5,>=4.0.1->gitpython<4,>=3.1.9->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (5.0.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (0.4.2)\n", + "Requirement already satisfied: cryptography>=38.0.3 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (46.0.4)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (4.9.1)\n", + "Requirement already satisfied: graphql-core<3.3,>=3.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from graphene<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.2.7)\n", + "Requirement already satisfied: graphql-relay<3.3,>=3.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from graphene<4->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.2.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from huggingface-hub>=0.22.0->datasets==3.0.1->-r ../requirements.txt (line 5)) (1.2.0)\n", + "Requirement already satisfied: zipp>=3.20 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from importlib-metadata!=4.7.0,<9,>=3.7.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.23.0)\n", + "Requirement already 
satisfied: opentelemetry-semantic-conventions==0.60b1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from opentelemetry-sdk<3,>=1.9.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (0.60b1)\n", + "Requirement already satisfied: six>=1.5 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas==2.2.2->-r ../requirements.txt (line 1)) (1.17.0)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (3.4.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (3.11)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (2026.1.4)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from rsa<5,>=3.1.4->google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (0.6.2)\n", + "Requirement already satisfied: anyio<5,>=3.4.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from starlette<0.39.0,>=0.37.2->fastapi==0.115.0->-r ../requirements.txt (line 12)) (4.12.1)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.1.5)\n", + "Requirement already satisfied: shellingham>=1.3.0 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (1.5.4)\n", + "Requirement already satisfied: rich>=12.3.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (14.3.2)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.0.4)\n", + "Requirement already satisfied: typer-slim<1.0.0,>=0.3.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.23.2)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.23.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (7.5.0)\n", + "Requirement already satisfied: wrapt in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (2.1.1)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (25.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in 
/Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from aiohttp->datasets==3.0.1->-r ../requirements.txt (line 5)) (1.22.0)\n", + "Requirement already satisfied: cffi>=2.0.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from cryptography>=38.0.3->google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (2.0.0)\n", + "Requirement already satisfied: pycparser in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from cffi>=2.0.0->cryptography>=38.0.3->google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.16.2->mlflow==2.16.2->-r ../requirements.txt (line 7)) (3.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from rich>=12.3.0->typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from rich>=12.3.0->typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (2.19.2)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/Bartley/miniforge3/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer<1.0.0,>=0.3.0->spacy==3.7.6->-r ../requirements.txt (line 3)) (0.1.2)\n" + ] + } + ], + "source": [ + "!pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### 1d. Configure Environment Variables (`.env`)\n", + "\n", + "One of the worst things you can do as a developer is hardcode filepaths and important keys that are unique to your environment. This makes it harder to reproduce and run your code in other environments & other machines. Environment variables let you configure things like file paths without hardcoding them in your source code. We store ours in a `.env` file, which is loaded automatically by the `python-dotenv` package.\n", + "\n", + "This project is going to use two variables:\n", + "- `MLFLOW_TRACKING_URI` — where MLflow stores experiment data (default: `./mlruns`)\n", + "- `DATA_CACHE_DIR` — where downloaded datasets are cached (default: `./.hf_cache`)\n", + "\n", + "The cell below copies the example file to `.env` if you haven't already. Projects like this one will often give you an example `.env` file so you know which variables you need to set to be able to run it." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".env already exists — skipping copy\n", + "\n", + "Contents of .env:\n", + "MLFLOW_TRACKING_URL=./mlruns\n", + "DATA_CACHE_DIR=./.hf_cache\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "env_example = PROJECT_ROOT / \".env.example\"\n", + "env_file = PROJECT_ROOT / \".env\"\n", + "\n", + "if not env_file.exists():\n", + " shutil.copy(env_example, env_file)\n", + " print(f\"Copied {env_example.name} -> {env_file.name}\")\n", + "else:\n", + " print(f\".env already exists — skipping copy\")\n", + "\n", + "print(f\"\\nContents of .env:\")\n", + "print(env_file.read_text())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1e. Verify Installation\n", + "\n", + "Let's do a quick sanity check: import the key packages and print their versions. 
If any of these fail, go back and re-run the `pip install` cell above." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pandas 2.2.2\n", + "sklearn 1.5.2\n", + "mlflow 2.16.2\n", + "datasets 3.0.1\n", + "fastapi 0.115.0\n", + "matplotlib 3.9.2\n", + "\n", + "All packages imported successfully!\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import sklearn\n", + "import mlflow\n", + "import datasets\n", + "import fastapi\n", + "import matplotlib\n", + "\n", + "for pkg in [pd, sklearn, mlflow, datasets, fastapi, matplotlib]:\n", + " print(f\"{pkg.__name__:20s} {pkg.__version__}\")\n", + "\n", + "print(\"\\nAll packages imported successfully!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 2. Imports & Configuration\n", + "\n", + "Now that our environment is ready, let's import the libraries we'll use and load the project configuration.\n", + "\n", + "### Why a config file?\n", + "\n", + "Instead of scattering settings throughout the code (dataset name, model parameters, etc.), we keep them all in one YAML file: `configs/baseline.yaml`. 
This makes it easy to:\n", + "- **Change experiments** without editing code (just tweak the YAML)\n", + "- **Reproduce results** by saving/sharing the config alongside your model\n", + "- **Compare runs** by seeing exactly which settings produced which results" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys, yaml\n", + "from pathlib import Path\n", + "from collections import Counter\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from datasets import load_dataset\n", + "\n", + "# Make sure Python can find our src/ modules\n", + "if str(PROJECT_ROOT) not in sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from src.utils import set_all_seeds, get_env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set the random seed\n", + "\n", + "Machine learning involves randomness (shuffling data, initializing weights, etc.). Setting a **seed** means the random number generator always produces the same sequence, so your results are **reproducible** — you (or a teammate) can re-run this notebook and get the exact same numbers." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random seed set to 42 — results will be reproducible.\n" + ] + } + ], + "source": [ + "set_all_seeds(42)\n", + "print(\"Random seed set to 42 — results will be reproducible.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the configuration file\n", + "\n", + "Let's load `configs/baseline.yaml` and see what's inside. Don't worry about understanding every parameter right now — we'll explain each one when we use it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Configuration:\n", + " experiment_name: t0_setup_template\n", + " dataset: ag_news\n", + " test_size: 0.2\n", + " random_state: 42\n", + " tfidf: {'max_features': 30000, 'ngram_range': [1, 2]}\n", + " model: {'type': 'logreg', 'C': 2.0, 'max_iter': 200}\n", + " metrics: {'average': 'macro'}\n" + ] + } + ], + "source": [ + "cfg_path = PROJECT_ROOT / \"configs\" / \"baseline.yaml\"\n", + "\n", + "with open(cfg_path) as f:\n", + " cfg = yaml.safe_load(f)\n", + "\n", + "print(\"Configuration:\")\n", + "for key, value in cfg.items():\n", + " print(f\" {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What these settings mean:**\n", + "\n", + "| Setting | Value | What it controls |\n", + "|---|---|---|\n", + "| `dataset` | `ag_news` | Which Hugging Face dataset to download |\n", + "| `test_size` | `0.2` | 20% of training data is held out for validation |\n", + "| `random_state` | `42` | Seed for the train/validation split |\n", + "| `tfidf.max_features` | `30000` | Keep only the 30k most frequent terms (words and two-word phrases) |\n", + "| `tfidf.ngram_range` | `[1, 2]` | Use single words *and* two-word phrases |\n", + "| `model.C` | `2.0` | Inverse regularization strength (higher = less regularization) |\n", + "| `model.max_iter` | `200` | Max training iterations for the classifier |\n", + "| `metrics.average` | `macro` | Average F1 equally across all classes |\n", + "\n", + "We'll revisit these in notebook 2 when we actually build the model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 3. Load the Dataset\n", + "\n", + "### What is AG News?\n", + "\n", + "[AG News](https://huggingface.co/datasets/fancyzhx/ag_news) is a widely-used benchmark dataset for text classification. 
The original corpus contains over a million articles, but we will use a sampled subset with **120,000 training** and **7,600 test** news articles, each labeled with one of four categories:\n", + "\n", + "| Label | Category |\n", + "|---|---|\n", + "| 0 | World |\n", + "| 1 | Sports |\n", + "| 2 | Business |\n", + "| 3 | Sci/Tech |\n", + "\n", + "We're using Hugging Face's `datasets` library to download it. The library automatically caches the download so you only need to wait once.\n", + "\n", + "### What are train / validation / test splits?\n", + "\n", + "In machine learning, we split data into separate groups:\n", + "- **Train** — the model learns from this data\n", + "- **Validation** — used to tune settings and check progress *during* development\n", + "- **Test** — held back until the very end to get an honest measure of performance\n", + "\n", + "This prevents the model from \"cheating\" by memorizing answers it'll be tested on." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset splits available: ['train', 'test']\n" + ] + } + ], + "source": [ + "# Where to cache the downloaded dataset (from our .env file)\n", + "cache_dir = get_env(\"DATA_CACHE_DIR\", \"./.hf_cache\")\n", + "cache_dir = str(PROJECT_ROOT / cache_dir)\n", + "\n", + "# Download the dataset (or load from cache if already downloaded)\n", + "ds = load_dataset(cfg[\"dataset\"], cache_dir=cache_dir)\n", + "print(f\"Dataset splits available: {list(ds.keys())}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert to pandas DataFrames\n", + "\n", + "Hugging Face datasets have their own format, but pandas DataFrames are more familiar and flexible for exploration. Let's convert." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: 120,000 rows, 2 columns\n", + "Valid: None (we'll create this split in notebook 2)\n", + "Test: 7,600 rows\n" + ] + } + ], + "source": [ + "train_df = pd.DataFrame(ds[\"train\"])\n", + "test_df = pd.DataFrame(ds[\"test\"])\n", + "\n", + "# Some datasets include a validation split; AG News does not,\n", + "# so we'll create one ourselves in notebook 2.\n", + "valid_df = pd.DataFrame(ds[\"validation\"]) if \"validation\" in ds else None\n", + "\n", + "print(f\"Train: {train_df.shape[0]:,} rows, {train_df.shape[1]} columns\")\n", + "if valid_df is not None:\n", + " print(f\"Valid: {valid_df.shape[0]:,} rows\")\n", + "else:\n", + " print(\"Valid: None (we'll create this split in notebook 2)\")\n", + "print(f\"Test: {test_df.shape[0]:,} rows\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Peek at the data\n", + "\n", + "Always look at a few rows before doing anything else. This helps you spot issues like missing values, weird formatting, or columns you didn't expect." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabel
0Wall St. Bears Claw Back Into the Black (Reute...2
1Carlyle Looks Toward Commercial Aerospace (Reu...2
2Oil and Economy Cloud Stocks' Outlook (Reuters...2
3Iraq Halts Oil Exports from Main Southern Pipe...2
4Oil prices soar to all-time record, posing new...2
5Stocks End Up, But Near Year Lows (Reuters) Re...2
6Money Funds Fell in Latest Week (AP) AP - Asse...2
7Fed minutes show dissent over inflation (USATO...2
8Safety Net (Forbes.com) Forbes.com - After ear...2
9Wall St. Bears Claw Back Into the Black NEW Y...2
\n", + "
" + ], + "text/plain": [ + " text label\n", + "0 Wall St. Bears Claw Back Into the Black (Reute... 2\n", + "1 Carlyle Looks Toward Commercial Aerospace (Reu... 2\n", + "2 Oil and Economy Cloud Stocks' Outlook (Reuters... 2\n", + "3 Iraq Halts Oil Exports from Main Southern Pipe... 2\n", + "4 Oil prices soar to all-time record, posing new... 2\n", + "5 Stocks End Up, But Near Year Lows (Reuters) Re... 2\n", + "6 Money Funds Fell in Latest Week (AP) AP - Asse... 2\n", + "7 Fed minutes show dissent over inflation (USATO... 2\n", + "8 Safety Net (Forbes.com) Forbes.com - After ear... 2\n", + "9 Wall St. Bears Claw Back Into the Black NEW Y... 2" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each row has:\n", + "- `text` — the news article content\n", + "- `label` — an integer (0–3) representing the category\n", + "\n", + "Notice the labels are numbers, not names. That's typical — it's more efficient for the model. We'll map them to human-readable names when we need to." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 4. Exploratory Data Analysis (EDA)\n", + "\n", + "EDA is a **critical** first step in doing machine learning and data science. \n", + "\n", + "EDA is about asking questions: *How big is the data? Are the classes balanced? How long are the texts?* These answers shape decisions you'll make later.\n", + "\n", + "### 4a. Basic Statistics\n", + "\n", + "Let's compute some summary stats. We'll count rows, measure text lengths (in tokens — roughly, words), and look at the distribution of labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TRAIN ===\n", + " rows: 120000\n", + " avg_tokens: 37.8\n", + " median_tokens: 37.0\n", + " label_counts: {0: 30000, 1: 30000, 2: 30000, 3: 30000}\n", + "\n", + "=== TEST ===\n", + " rows: 7600\n", + " avg_tokens: 37.7\n", + " median_tokens: 37.0\n", + " label_counts: {0: 1900, 1: 1900, 2: 1900, 3: 1900}\n" + ] + } + ], + "source": [ + "def describe_dataset(df, text_col=\"text\", label_col=\"label\"):\n", + " \"\"\"\n", + " Compute basic stats for a text classification DataFrame.\n", + " \n", + " 'Tokens' here means words (split on whitespace). This is a rough\n", + " approximation — real tokenizers are more sophisticated, but this\n", + " gives us a good-enough picture for EDA.\n", + " \"\"\"\n", + " # Count words in each text by splitting on spaces\n", + " lengths = df[text_col].astype(str).str.split().map(len)\n", + " # Count how many examples belong to each label\n", + " counts = Counter(df[label_col])\n", + " \n", + " return {\n", + " \"rows\": len(df),\n", + " \"avg_tokens\": round(float(lengths.mean()), 1),\n", + " \"median_tokens\": float(lengths.median()),\n", + " \"label_counts\": dict(sorted(counts.items())),\n", + " }\n", + "\n", + "\n", + "print(\"=== TRAIN ===\")\n", + "train_stats = describe_dataset(train_df)\n", + "for k, v in train_stats.items():\n", + " print(f\" {k}: {v}\")\n", + "\n", + "print(\"\\n=== TEST ===\")\n", + "test_stats = describe_dataset(test_df)\n", + "for k, v in test_stats.items():\n", + " print(f\" {k}: {v}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What to look for:**\n", + "\n", + "- **Label counts** — Are they roughly equal? AG News is well-balanced (each class has ~30k examples). 
If one class had far fewer examples, we'd need strategies like oversampling or class weights to avoid the model ignoring the minority class.\n", + "- **Token lengths** — The average and median give you a sense of typical document size. If there's a huge gap between them, it suggests some very long outliers that might need truncation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4b. Token-Length Histogram\n", + "\n", + "A histogram shows the *distribution* — not just the average, but the full shape. Are most texts about the same length, or is there a wide spread? Are there extreme outliers?\n", + "\n", + "This matters because our TF-IDF model (in notebook 2) treats each text as a \"bag of words.\" Very short texts have fewer words to work with; very long ones might be noisy." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved plot to: /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template/reports/eda_token_lengths.png\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAskAAAGJCAYAAAB4ha4cAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABvnElEQVR4nO3dB3hT1fsH8Le7bChQhkDZeyMgIku2TMEBiCAbBGQoyx8blSV7Olj+maKIMmTvPQRZCoKlgJS9Kd35P9+DNyZpZknbNPl+eELWyc3NyW3y5tz3vsdLp9PphIiIiIiI9Lz/u0hERERERAySiYiIiIjM4EgyEREREZEJBslERERERCYYJBMRERERmWCQTERERERkgkEyEREREZEJBslERERERCYYJBMRERERmWCQTJTCRo8eLV5eXsnyXLVr11Ynza5du9Rz//DDD8ny/B988IHkz59fXNmTJ0+ka9eukjNnTtU3/fv3T7F1uXz5slqHxYsXO/Q4tMfj8HhPh20uffr0Sf48V69elcDAQNm/f78kJ/w94TU64/PAXbVp00beeeedlF4NSoUYJBM5kRacaCd8aebOnVsaNmwoM2fOlMePHzvlea5fv66C65MnT4qrceV1s8cXX3yh3sdevXrJ//3f/8n7779v8zFxcXHqfcZ7/uuvvzr8nMuXL5fp06eLq0ruH1OOioiIUNsc1jOljB07VqpWrSrVq1fX95c9J08VHR0tM2bMkAoVKkjGjBklc+bMUqpUKenevbv8+eefTv3cGTJkiPz444/y+++/O2ntyVP4pvQKELkjfGEWKFBAYmJi5MaNG+pLEyOSU6dOlV9++UXKli2rbzt8+HAZOnSow18IY8aMUaNI5cuXt/txW7ZskaRmbd2++eYbiY+PF1e2Y8cOeeWVV2TUqFEOPSY8PFy95mXLlknjxo0dDpLPnDmTYNQ6JCREnj17Jn5+fg4tz9MgSMY2BykxMnr79m1ZsmSJOkGJEiXUDyxDw4YNUyPa//vf/5z63OfPnxdv78SNdyXH54ElrVu3Vj8o27ZtK926dVOflQiO169fL6+++qoUL17caZ87CMRffvllmTJlinz33XdOfiXkzhgkEyUBBEn4UDb8gkQg1bRpU2nevLn88ccfkiZNmud/hL6+6pTUQUTatGnF399fUlJqCPZu3bolJUuWdOgxS5culYoVK0rHjh3l008/ladPn0q6dOlsPs5WO21vBLk2vP/4G27WrJm6niNHDmnfvr1RmwkTJki2bNkS3G4IPyAxwurIex4QEJDo9U6pz4OjR4+qYPjzzz9Xfy+GZs+eLQ8ePHD6cyLdAj98586dmyzpN+QemG5BlExef/11GTFihISFhakvVWs5yVu3bpXXXntN7YLEB3qxYsX0XyYYla5cubK63KlTJ/1uWy1vFSNppUuXluPHj0vNmjVVcKw91lIOItIF0AZ5uAjaEMgjx9Ke3EfDZdpaN3M5yQgUP/74Y8mbN6/6wsdr/fLLL0Wn0xm1w3L69Okja9euVa8PbbF7dtOmTXYHv126dFEBDIKQcuXK6Uf+tHXHc4SGhsqGDRv0624rrxcjvT/99JM+7xHXf/75Z4u5sZcuXZI33nhDMmTIIO+9957qOzwftgvtObU+spSTjBE3PFf27NnVjy30mT0jlBi5q1GjhnqP8fxNmjSRs2fPirMguMFouPZeFi5cWCZOnGi090B7TXiPv/76aylUqJBqi+0GwZOp1atXqx8teM/wvqOvDbcjLA/9ABhJ1PoQf1eG/vnnH2nZsqV6D9D+k08+Udu9oZUrV0qlSpVU3yAFoEyZMiolwBZsk0i1cDT40rZp7H3Atox+0LZn9A9GVLNmzareY6yXuXQX079LLeULudEDBw5UrxXv95tvvqlGvO05RuH7779XAWyePHlUv9etW1cuXryY4LnnzJkjBQsWVOtXpUoV2bt3r115zvgbAKSmmPLx8VGv2fS969y5s/rb1f7uFy5
caLTe1j53oH79+uqzBp+tRPbiSDJRMkJ+K4JR7ObELkZzELRgxBkpGUjbwJcCvqC0A4KwKxe3jxw5UuXvIegBfKFq7t69q0azEbhh5ApfLtbgCxFfKsjdQzCJ/Nh69eqp/D5txNse9qybIQTCCMh37typAljsJt28ebMMGjRIfTFOmzbNqP2+fftkzZo18uGHH6pABnne2G175cqVBF+shhC44osb/YigBKkwCL4QXCCw69evn34X+YABA1RwgMAdtADMEqTP4GA/9DV+ZOB5EPS0a9cuQdvY2FiVn44fQAiC8AMGj3n48KFcu3ZN/3qtBVunTp1S/YpRefQxgiQEHevWrVPvoyV4bRjpxvMjcMXehXnz5ql1OXHixAsfUInl1apVS71vPXr0kHz58smBAwfUXhSkopjmXCPFBDn6aIttb9KkSdKqVSv5+++/9Xsc8OPh3XffVcHq+PHj5f79+2o7eemll/TLwfuD14EccgSCWAYYpjQhGMbrRiCLft+2bZva9Y4AHY8DBE/Y9Y+AEP0D2OODvztsH5YgTQDBvbYcR2EPE4JSbJcYadbeBwTn+NvADymMLiOAf/vtt9UILH7c2NK3b1/JkiWLGj3FDwn0P55j1apVNh+LUW+kcOCHBLZNvDdYj8OHD+vboM+xPGyL+JvBc+BHCJ4Tfz/WII0I8HeCQNnanrSbN2+q9CftBwXeb/zYw3bw6NEj9aPMns8d/NDCZxneT2wnRHbREZHTLFq0CMOfuqNHj1pskylTJl2FChX010eNGqUeo5k2bZq6fvv2bYvLwPLRBs9nqlatWuq++fPnm70PJ83OnTtV25deekn36NEj/e3ff/+9un3GjBn620JCQnQdO3a0uUxr64bHYzmatWvXqrafffaZUbu33npL5+Xlpbt48aL+NrTz9/c3uu33339Xt8+aNUtnzfTp01W7pUuX6m+Ljo7WVatWTZc+fXqj1471a9Kkic5eTZs21VWvXl1//euvv9b5+vrqbt26leC1Yx2GDh2aYBl4PsN+0YSGhiboy5o1a+oyZMigCwsLM2obHx+fYDvE4+Hx48e6zJkz67p162b0mBs3bqjt0fR2U9p2snr1aottxo0bp0uXLp3uwoULRrfj9fr4+OiuXLli9JqyZs2qu3fvnr7dzz//rG5ft26d/rYyZcro8uTJo9Zfs2vXLtXOsL/wt4Lb8LdkSuv3sWPHGt2Ov8FKlSrpr/fr10+XMWNGXWxsrM4R2B7t2QZLlSpl9HcCeJy3t7fu7NmzCdpHREQYXcf2Wrp0ad3rr79udLvp36X23terV89omxgwYIB6Hx48eGDz86BEiRK6qKgo/e34HMDtp0+fVtdxH96/ypUr62JiYvTtFi9erNqZvk5TWC/tcypHjhy6tm3b6ubMmZNgm4YuXbrocuXKpbtz547R7W3atFHbrtZP1j53NEWLFtU1btzY6roRGWK6BVEywyihtSoXSLEA7LJP7EFuGH3Gbkd7dejQQY3Mat566y3JlSuXbNy4UZISlo/dqx999JHR7RjFRQxhWikCo9sY/dNgtBC7xTH6aOt5MGKLkUINRivxvBgF3r17d6LWHyP2GPk2XC5GtrVd1uYkdsQRsLt8z549atczRmoNWauUgFFSjJhjPe/cuaM/oe8xuoqR/BeFkXmM4GEk0fA58J5hJBfrbQgjxGir0Ub/tPcSB2KdPn1abZuGI+sYrcbIsqN69uxpdB3PZ7jd4O8uMbvjsQ2A4WtxBF6PuRx4wz04GEHHiC7W+bfffrNruRhRNdwm8Fi8D0jrsQWfHYb5yqbvzbFjx9Trxt4ww1FgjDbb0w9YL/zdfPbZZ6r9ihUrpHfv3mqEGduFlpOMzwBUpUCuNy4bblfYM4A+sbc/QNs2iezFIJkomSEoMwxITeFLArsgUasXaRLYjY+Ay5GAGbujHTkop0iRIgm+xJBPmtR1dvGFjdJppv2B3afa/YZMA0Ptiw9BhK3nwWs0rQJ
g6XnshV3X2N2Oo+eRyoHTvXv3VOCJXcmmEFDY2hVtjRakIDfXEX/99Zc+Lx67qw1PSP1Bis2LwnMgn9Z0+QiSwfQ5TN9LLbjS3kvtPcF2aMrcbdYgr9Y0bcZ0u0EKT9GiRVWaEt4j/BCxN98dTHPo7YXUH3OQVoE0A6x7UFCQPq0EgaE9bPXvizzW0nuD7dvetB38kEcePVJa8IMIgTJer5Z6ov0oRMCM3HXT7UobBHBk28V75Mll98hxzEkmSkbIO8WXnLUveYwgYdQNo3vIycQXNYIxBDgIaDD6Z4sjecT2svTlgtEpe9bJGSw9T2IDlBelBcLmDkDSgloc2GQYGCS2XNeL0H5gIS8ZI+qmnFFdBc+Bg6MGDx5s9n4EoCn1XtqzfQYHB6scfIxwYg8GTosWLVIj2YYHeJrScuHtCT7t/VvFAXDIR8aBt6jGgL062POB9UEutz1epH+T++8Mrw+DAdgLg4PyECjjoDttu8VxFcinN8cw99wWvEemAwJE1jBIJkpGWu1U7Cq0BoEUDiDCCbWVMcEFRl0QOGNkztmjIdpIo+GXIUZFDb+AMJpkrjQTRpUMA0FH1g27V3EQFdJPDEeTtckEtAN8XhSWgwPe8KVrGKS+yPOgCgYOTMOoF3aZG8Lz4CBNBDSog22LvX2m9TNqKjtCS1FBIKiN7DobngN7SZy1fO09MVdVwfQ2Z/09YO8Ldu3jhPcQo8tfffWVqkpj6YctRl0R6GJ7cBakGGAEGQG7YYk3BMmuwPC9qVOnjtGBqdj75Ejgagg/BPBYfB4hLQIjxvhcwA9xW9uVrW0A64aKPfjxQWQvplsQJRMcxT5u3Di1exW5e5Zgd70prTh+VFSUOtdq6zqrnigK7BvmSaPUFCoSGE6KgSDo0KFD6kh7w13CpqXiHFk3lELDFyBqoxpClQd86Tk6KYe158GkLoZH9uNLc9asWSrf1TTIdWQUGSOnyOE2PKE8G5ZpLuXCHPSZPbvRETRgdBHlr1DRw95RPvwoQ+42fmwhPcSUaWmwxMBrPnjwoArsTGFbQH87Amk4SCvBtongW4P8ceQqG0KVEO15EkvLLdbgx5QW7Gl/d5YCO9RER56us2AkF9u/YYk6BJ8oNecK8Hoxgo7JgQzfV2zv9oyoIwg23X619w/bEH6QY1tHP2B0GT8azP0wNNxubX3unDt3TiIjIy1W2iEyhyPJREkAu2oxSokvEJQwQoCMA4IwAoOSYdYmC0ApI6RboMwT2iPnDrtckSeJcl1awIoDjebPn69GWvAFgTxYS/mNtiDnEctGnh/WF+WiMHJmWKYOOdIInhs1aqQCIpQdQ71nwwPpHF03jNhhJAqj5AgCULsYKSU4aBGlnUyXnVg4iAkjgij5hvrRyJvEa0E5KLxWazniliAgwI8X1AQ2ByNWKMOFA4sw0Yg1qIGLAB51bVHvFYG7NjGFKZS9w3uFZeJ1oV/Rd0jNsTQVOAJk5LNidBuPw65tBCEIVPA4pIuY/lAxB8GKuSmDsSscZfuwbaN8IfoZrwkHwiGgRV9jHVHizBEI6lu0aKHWD9smAjCsJ4Jnw8AZI7k4+A19iLQObM9o40juNrZv/EBFWhP+1rCHBD+i8B5rueuWYB2xDaMkGfr6ReFvH3uQ8LeGUoL4DEBNYvxNYo9ISsOIO+pQY/tGf+HzAO8vUiTwN2trVBfTQ+N14UcwDgrE+4XSgUhrQX4y/ia1lA+Uo8MeNHyG4PMI7zPeJ/xdYS+UNqhg63MHn7/4MYWUICK7GdW6IKIXopVf0k4oWZYzZ05d/fr1VRklw1JjlkrAbd++XdeiRQtd7ty51eNxjhJJpqW1UDKrZMmSqtyYYekjlFZCuSlzLJV8WrFihW7YsGG64OBgXZo0aVRJMnPlmKZMmaLKxQUEBKiyZ8eOHUuwTGvrZloCDlDeC+Wp8Dr9/Px0RYoU0U2ePNmofBVgOb17906wTpZK05m6efOmrlOnTrps2bKpfkV5MXP
louwpAXf8+HG1PiNGjLDY5vLly6oNXpv22lEizZwnT57o2rVrp8q0GZY3M1cCDs6cOaN78803VfvAwEBdsWLFjNbFtASc4fvdsGFDVToLjytUqJDugw8+UO+jNdp2Yum0d+9e/XuJ7ahw4cKqj9HXr776qu7LL79UJcwMXxPeY1PmyritXLlSV7x4cbXNoQTaL7/8omvdurW6zdCBAwdUSTc8r+FyLPW76d/dDz/8oGvQoIH6G8Ay8uXLp+vRo4cuPDxcZ8+2hW39//7v/xwuAWdum4YFCxaovwW8brxWvKem62ytBJxpGUrtPcS5rc8D01J/lrbDmTNnqufHOlapUkW3f/9+9R40atRIZ6u/JkyYoJ4b5d3Qd1myZFHl7fA+mGuPfsqbN6/6jMBnat26dVW5RXs+d6Bq1aq69u3bW10vIlNe+M/+kJqIiChlYXQXI+GuNHsaJre4cOGCOujOUyGPG+8LJnRBKoarwB4W7EHB6LOWukZkD+YkExGRS0L+tGkuM6Ygxu56W1MfJzfMbIeZ97SZMd0d8ntNx9iQP470B1d7b5CygWMFGCCToziSTERELgl5rqhqgBJgOJAP+dDIOc2UKZM6kMvaVOSUtPBjBdNRY6psvA8YpV2wYIHK30bevyN12olcFQ/cIyIil4QqBzgA8Ntvv1WVDHAwFg5qw8ggA+SUhYNfcdAqDiTF6DEOvkNNabw3DJDJXXAkmYiIiIjIBHOSiYiIiIhMMEgmIiIiIjLBnGQnlr5BEXQUMXf2lMFERERE9OJQlQUzzOJgYMysaQ2DZCdBgGxp5i0iIiIich1Xr15Vs2tawyDZSbRpbUNDQ9Xc8w0aNBA/Pz9nLd5ja6RiimL2JfvS1XDbZF+6Km6b7EtXFONC3+eYPh6DmlrcZg2DZCfRUizQ6ZgfPmPGjCm+IbjDH5XL9mVEhEjlys8vHz0qkjatuDKX7stUiP3JvnRV3DbZl64oxgW/g+xJjWWQTJQYmGnq3Ln/LhMREZFbYXULIiIiIiITDJKJiIiIiEww3YKIiIjsKp0VGxsrcXFxHpNH6+vrK5GRkR7zmt2hL318fNRzOaMcL4NkIiIisio6OlrCw8MlAgcte9CPgpw5c6pSYZz/IHX1JQ4SzJUrl/j7+7/QchgkExERkdXJslDeFCN0mIABgYcnBI143U+ePJH06dPbnHSCXKMvEYzjB93t27fVNlukSJEXej4GyUSJgS+IkJD/LhMRuSkEHQhyUFsWI3SeAq8Zrz0wMJBBcirqyzRp0qgyc2FhYfrnTCwGyUSJgS+Ky5fZd0TkMTiaSp62rXL/ARERERGRCQbJREREREQmmG5BHglJ/Zi/3V6YSjN79uz/3fDsmUjNms8v79mDJKgkWEsiInJ1u3btkjp16sj9+/clc+bMsnjxYunfv788ePAgpVct1cufP7/qS5xSAoNk8sgAuX2nrnLvsf2ljIIypJWli779L1COjxc5duy/y0RE5HI++OADWbJkifTo0UPmz59vdF/v3r1l7ty50rFjRxXYOsu7774rb7zxhqSE5s2by8mTJ+XWrVuSJUsWqVevnkycOFFVJYHRo0fLmDFjEjwOB2Q+ffo0BdbYtTFIJo+DEWQEyNmrtZZ0QTlstn9676bcPvijepzRaDIREbk8VOVYuXKlTJs2TVU+AExqsXz5csmXL5/Tnw/PoT1PcsOI9qeffqpqBP/zzz/yySefyFtvvSUHDhxQ9+N6z549jR5Tt25dqVy5sqSUuLg4VVLQFQ8Mdb01IkomCJAzBuexebInkCYi8kgYfbR0ioy0vy1S2OxpmwgVK1ZUgfKaNWv0t+EyAuQKFSokKFU2fvx4KVCggKRLl05ee+01+eGHH4zabNy4UYoWLaoCYQSll00qHWFUGmkXmkuXLkmLFi0kR44cqk4wAtJt27YlSCv44osvpHPnzpIhQwa1bl9//bXDr3XAgAHyyiuvSEhIiLz66qsydOhQOXTokJrxDvD
8mNRDO928eVPOnTsnXbp0sbjMl19+Wb788kv99ZYtW6oSa6h7DNeuXVNB7sWLF9V1pJ106NBBjWRjhLpx48by119/JeifX375RUqWLCkBAQFy5coVNfrdrFkz1a/o/2XLliWogYyRcPQNHoPR8Y8++kiSEoNkIiIiSpz06S2fWrc2bhscbLlt48bGbfPnN98ukRB8Llq0SH994cKF0qlTpwTtECB/9913KjXj9OnT8uGHH6qAb/fu3ep+zBjXqlUrFcwhraFr164qELUGwSTSL7Zv3y4nTpyQRo0aqccjMDQ0ZcoUFZCiDZ63V69ecv78ef39tWvXVukj9rp3754KNBEsI6g159tvv1UBf40aNSwup1atWirvWgtU9+7dq4Lcffv2qdvQNy+99JIULlxYXcc6Hjt2TAXBBw8eVI9p2rSpPlAHzNyINBA8/9mzZyU4OFg9Dv27c+dO9cMEqTAInDU//vij2hvw1VdfqaB77dq1UqZMGUlKDJKJiIjIrbVv314FdZhgAqf9+/er2wxFRUWp0VwE0A0bNpSCBQtKu3bt5L333lOBGcybN08KFSqkAtpixYqp+2wFruXKlVM50aVLl1YzwI0bN04tA0GkIQTSCI4RbA4ZMkSyZcumAkYNRlCRRmELHotR8KxZs6pA/OeffzbbDiknCKKtjSJrwTn6DmkRp06dUjMu4nVrgTPOEUgDgle8LgS/CLzx2vEcSP3YsGGDaBAwIwhGAI9+xGj0r7/+Kt98840aCa9UqZIsWLBAnhnsYcBrweg38qzRF1WqVJFu3bpJUmJOMhERESXOv7vczfLxMb5uMCqYgGk+qpMna8LxJE2aNFG7+jGyicsIQg0hXQAjnPXr1ze6HbO2aWkZf/zxh1StWtXo/mrVqtkcSUaaAILE8PBwiY2NVcGf6Uhy2bJl9ZeRvoCA0HAkFSPc9hg0aJAKfPFjAAfpYSR8/fr1CaYS/+mnn+Tx48fqwEVratSoodphhBu5zQiIEThPmDBBP5KM59T6x9fX16iPEKwjEL5w4YL+NgTahq9XexyCY03x4sWN0lbefvttmT59uvrxgtF4/KjAiDwel1QYJBMllskHLBGRx0mXLuXbOpBy0adPH3V5zpw5Ce7XcmwRzCJ9APnJuA15vC9yIB4Oltu6davK68UoMZaFg+kQfBsyTYlAUIt1cBSCf5yQRlGiRAmVj428ZNNgHqO9SINArrQ1mTNnViPCGDFG+gR+RNSsWVNV8UDgi9FjbSTZXugD06DdFrwOpJ8gnxv9iVH3yZMnqyDdUjrJi2K6BVFi4AP89u3npyT4MCciIufC6CMCU+zqRzqFKcODyBDM4oRRS5wjQAMEnUeOHDF6HAJQa5DagZSMN998U+XQYoTY9GC/pKIF2UglMRQaGqpSOWylWmgQBKP9nj171ChyUFCQ6ovPP/9cpYAgIAfchpHyw4cP6x979+5dFdxiNNkSjBrjccePH9ffhseY1ppGcI3R45kzZ+qDduSOJxWOJBMREZHb8/HxUbv1tcumUFUCo76oEIHgEvmy169fl99//10yZcqk0hJQPg35yEgvwEF7COps1VhGHjKqaSC4w+jpiBEjEjVCjLQJjHDj4EJzEJgePXpUVeRAZQlU1cBzIf/ZdBQZedcIblF5wh61a9eWWbNmqbQVBLTabbNnz1ZpEIavFZU8kCuMPG70KQ5sxHpbqx2NABo/YpC7jbxvpFBgAhHDEXz0M/KikcqBqhlLly5V96OSR1JJ0ZFkvNEohYJOxJGNKCtieCSnlliOgt/IacEuj9atW6uSJYbwqw/5Reg0LAcbL36RGMIvDpSBwa9E/Co0t1Fj9wvKsAQGBqo3wfTXIhEREaVemD0VJ0twUB0CS8QnpUqVUmkRSL9ASTLAAWOosoDKCkhBQBUMHOxnzdSpU1XQiqAbgTJGsRGPOAqxDnKaLUEMhGAcdY8RdGKUGHm/SEdA7KNBgI4YCKPb5n4sWMpLjo+PN0q
rQJCMoBXnhlBFBLnFSOVAcI4ccORE20qJwONQ1g3PgQoi3bt3VzGdYdoHDuyrXr26el1Iu1i3bp2KD5OKlw5rn0Lwq6FNmzYqUEZQiwLYZ86cUTX7cGQmoAQKNlC8ofglh3wiFJzG7gvAG1S+fHm1+wK5KdiA8GsLv2K0DRe7FXBUKX4B4pcfyrDgFwqWq+1yWbVqlXocNngEyEgOX716tQraDd8kSzDRBNbvzp076ihQ/GJKqhwZT4FdYqhH6ey+xK/rNp17Sv4mH6o6yLY8unVNLm+YKysXzle/yBUccav9Av/1V5efljqp+tJTsT/Zl560bWKwCt+jCBQxiOQpEBTiux1BtStOdJGaxCdzX1rbZrV47eHDh1Z/MKV4usWmTZuMriMQRkCK3RdICscLQAkQzIrz+uuv639pIOcFOUAoE7JlyxYVVOMXBZLPETDjlyBKoOBoUhxBicAXHYVdJIDHI5BFvT0tSMYvPQTWWt1EPAZBNHZJmKuBiPwewxwfdDpodQAN6wFS4iRVX+KHlb+/n/h54w/A9i4vtEN7PE6/LlFR4vdv3cwYbAdJeHStM3C7ZH+6Km6brt+fWBbG0xDoJCZNILXSxhC1106ppy/xHHgubLumo+WO/G241Dc7gmJAQjggWMaLQU08DXJhsLsDydoIknGORHjDozMR+GIEGgWqUbYFbQyXobXBaDIgkR/PNWzYMP39+KWDx+Cx5mBXjLn5z5HYjl0eOPKSnCMp+nJwnx7/Xrphu3EWb5EiPVQumz6fLTJSmv579+bNmyUulYyucLtkf7oqbpuu25/ID8XeWlR6MK3I4AlQ/oxSV19iO0WZPRxoaJp+izJ/qS5IRtSPoBW5JkiNgBs3bqiRYMM6eYCAGPdpbUzLl2jXbbXB6C86EVMoYpTQXJs///zT7PoioB44cKD+OpaFo18xRSWS51Eihbu1Xwx+IOGD3tl9iV0wnXr3l5CG3SRDttw22z++c13CNn8ji+ZM1+elGU6PqvZGuHiFi6TqS0/F/mRfetK2iV3XmAkNxwV5UroFRiIR1OG4KUfLlVHK9iW2WRzUh6wEc+kWqS5IxsF5yEfWpjl0dUiCN0yE12gfSjhnMOIczu5L7HqJjo6RmHiRWDuOXUU7tMfj9OthsD7qtlQSeHK7ZH+6Km6brtufGERCYIM9rJ6Um6ulBWivnVJPX+I58Fzm/g4c+btwiXcdB+PhyEekKuTJ89+BVNi9gyFz0zp5qG6B+7Q2ptUutOu22iBhG780UHQbAZC5NtoyiIiIPFkKHudPlCLbqndKvwgEyJgacceOHf/tyv4XSogg4kc1Cg2qTaAMilbzD+coJG04dSN2NSEARmFwrY3hMrQ22jKQ0oHnMmyDXz24bmu6SSIiInemjbw5kstJlJK0bfVF96b4pnSKBSpX/PzzzypPRcshRmkOjPDiHHX+kPuLg/kQ+Pbt21cFrjhoDxo0aKCC4ffff18mTZqkljF8+HC1bC0dAqXfUPB68ODBalpKBOTff/+9ql6hwXOgUPjLL78sVapUUSXgnj59qq92QZRA2rTsFCJye9jTimODtMEoHJzuCTm6GCzD3mzktzLdInX0JQZfESBjW8U2a28daJcMkjGrCpgrRI0i14AybehQTCKCkms4SGru3Ln6tugApGqgmgWCZ9RXRrA7duxYfRuMUCMgxiw6M2bMUCkdmLPccFpKzEF++/ZtGTlypAq0UUoOJepszWlOHgoH6hkcvEdE5M601EPDvbbuDgEXDu7HoJ0n/Chwp77MnDmzU9JlfV09ZwRHJWImPJwswZSEKJ5uDQLxEydOWG2D1A+ciIiI6D8IbDCNMeYy8JR5APA6UUIMFRJ4IH7q6Uss/0VHkF2uugURERG5NgQfzgpAXB1eJ2rsYrCOQbJn9qVLVLcgSnUiI0WaNHl+wmUiIiJyKxxJJkqMuDgRLcUHl4mIiMitcCSZiIiIiMgEg2QiIiIiIhMMkomIiIiITDB
IJiIiIiIywSCZiIiIiMgEg2QiIiIiIhMsAUeU2Gmp7ZgxkoiIiFInjiQTEREREZlgkExEREREZIJBMlFiYCrqt99+fuK01ERERG6HQTJRYmAq6h9+eH7itNRERERuh0EyEREREZEJBslERERERCYYJBMRERERmWCQTERERERkgkEyEREREZEJBslERERERCY4LTVRYqRNK/LkyX+XiYiIyK0wSCZKDC8vkXTp2HdERERuiukWREREREQmGCQTJUZUlMgHHzw/4TIRERG5lRQNkvfs2SPNmjWT3Llzi5eXl6xdu9boftxm7jR58mR9m/z58ye4f8KECUbLOXXqlNSoUUMCAwMlb968MmnSpATrsnr1ailevLhqU6ZMGdm4cWMSvnJK9WJjRZYseX7CZSIiInIrKRokP336VMqVKydz5swxe394eLjRaeHChSoIbt26tVG7sWPHGrXr27ev/r5Hjx5JgwYNJCQkRI4fP64C7NGjR8vXX3+tb3PgwAFp27atdOnSRU6cOCEtW7ZUpzNnziThqyciIiIiV5WiB+41btxYnSzJmTOn0fWff/5Z6tSpIwULFjS6PUOGDAnaapYtWybR0dEqwPb395dSpUrJyZMnZerUqdK9e3fVZsaMGdKoUSMZNGiQuj5u3DjZunWrzJ49W+bPn++EV0pk3u3bt9UPOXtkzJhRsmfPzq4kIiJKBqmmusXNmzdlw4YNsgS7t00gvQKBbb58+aRdu3YyYMAA8fV9/tIOHjwoNWvWVAGypmHDhjJx4kS5f/++ZMmSRbUZOHCg0TLRxjT9w1BUVJQ6abRAJyYmxuicEi+p+jIuLk78/f3Ezxt/APE226Md2uNx+nWJiRE/w/VMxDreuXNHun3YV+4/ibCrfZb0aeWbubMkW7ZsDj8Xt0vnYn+yL10Vt032pSuKcaHYyJF1SDVBMoJjjBi3atXK6PaPPvpIKlasKEFBQSptYtiwYSrlAiPFcOPGDSlQoIDRY3LkyKG/D0EyzrXbDNvgdkvGjx8vY8aMSXD7zp07JW3atGokmpwjKfpycJ8e/16y/B7rZfEWKdJD/vjjD3UCn8hIafrv3Zs3b5a4wMBErUe3D953qP2RI0fkRXC7dC72J/vSVXHbZF+6oq0uEBtFRNg3MJWqgmSkS7z33nvqwDpDhiPAZcuWVSPGPXr0UEFsQEBAkq0PgnHD58ZIMg4KRDrI4cOHpX79+uLnp401UmJ/7eEPytl9GRoaKp1695eQht0kQ7bcNts/vnNdwjZ/I4vmTP/vB9fTp0Z7HRJTM9mR9TC7Di7Ql56K/cm+dFXcNtmXrijGhb6D7E1xTDVB8t69e+X8+fOyatUqm22rVq0qsbGxcvnyZSlWrJjKVUaqhiHtupbHbKmNpTxnQABuLgjX3nycp/SG4C6c3Zc+Pj4SHR0jMfEisXYcu4p2aI/H6dfDYH3UbYlYP0fWw+w6JAK3S+dif7IvXRW3TfalK/JzgdjIkedPFXWSFyxYIJUqVVKVMGzBQXne3t4SHBysrlerVk2VmjPMQcGvGQTQSLXQ2mzfvt1oOWiD24nMwlTUt249P3FaaiIiIreToiPJT548kYsXLxrtfkaQi/xiHISnDYujhvGUKVMSPB4H3CG1ASkOyFfGdRy01759e30AjAP5kDuM8m5DhgxRZd1QzWLatGn65fTr109q1aqlnqNJkyaycuVKOXbsmFGZOKIE01Kz0gQREZHbStEgGYEoAlyNluPbsWNHWbx4sbqMgFWn06k6xqaQ7oD7UfcYlSaQq4kg2TBXOFOmTLJlyxbp3bu3Go1GZYCRI0fqy7/Bq6++KsuXL5fhw4fLp59+KkWKFFGVLUqXLp3EPUBERERErihFg+TatWurANgaBLOGAa0hVLU4dOiQzefBAX3Ia7bm7bffViciu6D8n/ZjDJVUkvAgUSIiIkp+qeLAPaKUFhMdLWFhYfrrXhERUnDuXHX57549RWeQl8xJP4iIiFI/Bsl
ENkQ9eSiXQ/+W/p+O1lc0CYyLE23fRMde/STSx0ffPihDWlm66FvOjkdERJSKMUgmsiEm6pnEe/lKtldaSdbcIeq2gOhIkf071eWQRt0kyv95/e6n927K7YM/qgNOOYU0ERFR6sUgmchOabNkl4zBedRl/6hn+tszZH9JAgLS6K/fZo8SERGleqmiTjIRERERUXJikExEREREZIJBMhERERGRCeYkEyVCjF+ADJ68Vn/ZWrk4S9AmNiaW/U9EROSCGCQTJYLO21vuZsttV7k4SyKfRci1f8IlX0wM3wMiIiIXwyCZKInLxVly69IZCbu6UOJiGSQTERG5GgbJRIngExsjrX6cpy6vad1L4nz9LJaLs+TJ3RvseyIiIhfFA/eIEsEnLlYabVqqTrhMRERE7oVBMhERERGRCQbJREREREQmGCQTEREREZlgkExEREREZIJBMhERERGRCQbJREREREQmWCeZKBEwFfWIz1boLxMREZF7YZBMlMhpqa+/VIh9R0RE5KaYbkFEREREZIIjyUSJnJa6yfpF6vKGpp0STEtNREREHj6SHBcXJydPnpT79+87Z42IUgFMRd3i52/VidNSExERuR+Hg+T+/fvLggUL9AFyrVq1pGLFipI3b17ZtWtXUqwjEREREZFrB8k//PCDlCtXTl1et26dhIaGyp9//ikDBgyQ//3vfw4ta8+ePdKsWTPJnTu3eHl5ydq1a43u/+CDD9TthqdGjRoZtbl375689957kjFjRsmcObN06dJFnjx5YtTm1KlTUqNGDQkMDFTB/KRJkxKsy+rVq6V48eKqTZkyZWTjxo0OvRYiIiIi8uAg+c6dO5IzZ051GYHk22+/LUWLFpXOnTvL6dOnHVrW06dPVcA9Z84ci20QFIeHh+tPK1Y8L7ulQYB89uxZ2bp1q6xfv14F3t27d9ff/+jRI2nQoIGEhITI8ePHZfLkyTJ69Gj5+uuv9W0OHDggbdu2VQH2iRMnpGXLlup05swZh14PEREREXnogXs5cuSQc+fOSa5cuWTTpk0yb948dXtERIT4+Pg4tKzGjRurkzUBAQH6oNzUH3/8odbh6NGj8vLLL6vbZs2aJW+88YZ8+eWXaoR62bJlEh0dLQsXLhR/f38pVaqUyqGeOnWqPpieMWOGCsYHDRqkro8bN04F3bNnz5b58+c79JqIiIiIyAOD5E6dOsk777yjgmSkP9SrV0/dfvjwYZWu4GzIcw4ODpYsWbLI66+/Lp999plkzZpV3Xfw4EGVYqEFyID18fb2Vuvz5ptvqjY1a9ZUAbKmYcOGMnHiRHWwIZaLNgMHDjR6XrQxTf8wFBUVpU6GI9YQExNjdE6Jl1R9iVx6f38/8fPGH0C8zfZ+Pl4SGBhg1N7wcbiMf5baOrJci229Ra0z1j0x/cHt0rnYn+xLV8Vtk33pimJcKDZyZB0cDpKRqlC6dGm5evWqSrXASC9gFHno0KHiTBjdbdWqlRQoUEAuXbokn376qRp5RlCL57tx44YKoA35+vpKUFCQug9wjsebjoZr9yFIxrl2m2EbbRnmjB8/XsaMGZPg9p07d0ratGnVSDQ5R1L05eA+Pf69ZPk91quaS7pUnWjU3icyUn93w8w3JS4w0GJbR5ZrURZvkSI91N4TnBKL26VzsT/Zl66K2yb70hVtdYHYCJkPSVon+a233lLnkQaBQseOHcXZ2rRpo7+Mg+nKli0rhQoVUqPLdevWlZQ0bNgwo9FnjCTjoMA6deqoUez69euLnx9r577orz38QTm7L3Gwaafe/SWkYTfJkC23zfbhF07IoRXTpXrnERKct7C6zSs+Tk6NWKguX3maV3TPfCy2dWS5ljy+c13CNn8ji+ZMT/CjLyX70lOxP9mXrorbJvvSFcW40HeQtuc/SYJk7O794osvVK7uzZs35cKFC1KwYEEZMWKE5M+fXx38llTwPNmyZZOLFy+qIBm5yrdu3TJqExsbqypeaHnMOMd6GtK
u22pjKRcaMIKujaIb0t58nKf0huAunN2X2AsRHR0jMfEisXYcuxoTp5PIyCjj9t7ecqlgafvaOrJcS23jRa0z1v1F+oLbpXOxP9mXrorbJvvSFfm5QGzkyPM7XN3i888/l8WLF6syaoZ5vkjB+PbbbyUpXbt2Te7evavyoaFatWry4MEDVbVCs2PHDomPj5eqVavq26DihWEOCn7NFCtWTKVaaG22b99u9Fxog9uJiIiIyPM4HCR/9913qnwaSq8ZVrNAKTfUS3YE6hmj0gRO2m5wXL5y5Yq6D9UmDh06JJcvX1ZBbIsWLaRw4cLqoDooUaKEylvu1q2bHDlyRPbv3y99+vRRaRqobAHt2rVTwTxGuFEqbtWqVaqahWGqRL9+/VSVjClTpqjXgLzrY8eOqWURWZqWuuGv/6dOuExEREQeHiT/888/KlA1hdFbR49aRCBaoUIFdQIErrg8cuRIFYBjEpDmzZurOswIcitVqiR79+41SnNAiTdU1UD6BUq/vfbaa0Y1kDNlyiRbtmxRATge//HHH6vlG9ZSfvXVV2X58uXqcQj2MWEKKltgdJzIHExF/c73s9SJ01ITERG5H4dzkkuWLKkCVUzOYQiBpRbs2qt27dqi0+ks3r9582aby0AlCwS41uCAP6yzNajUgRMRERERkcNBMkZhUckCI8oYPV6zZo2cP39epWFgxjsiIiIiIo9Lt0Be8Lp162Tbtm2SLl06FTSjbituQ2kPIiIiIqLULlF1kmvUqOESBaGJiIiIiFxiJJmIiIiIyN3ZNZKMesJeXl52LRATeRARERERuX2QPH369KRfE6JUJMbPXyYNmae/TERERB4YJKOaBRH9R+ftI+eLV2KXEBERuSmHc5I3btxotn4xJuz49ddfnbVeRERERESpJ0geOnSoxMXFJbgdNZNxH5En8ImNlTrbV6sTLhMREZGHl4D766+/1Kx7pjA19MWLF521XkQuzScuRtovnawu73+tqcT5JqqaIhEREbnLSHKmTJnk77//TnA7AmRMLkJERERE5JEz7vXv318uXbpkFCB//PHH0rx5c2evHxERERGR6wfJkyZNUiPGSK8oUKCAOpUoUUKyZs0qX375ZdKsJRERERFRMvJNTLrFgQMH1LTUv//+u6RJk0bKli0rNWvWTJo1JCIiIiJKZok62giz7zVo0ECdiIiIiIg8MkieOXOmdO/eXQIDA9Vlaz766CNnrRsRERERkesGydOmTZP33ntPBcm4bG2EmUEyeYJYXz+Z0X+q/jIRERF5YJAcGhpq9jKRp4r38ZVT5V5L6dUgIiIiV6luMXbsWImIiEhw+7Nnz9R9REREREQeFySPGTNGnjx5kuB2BM64j8gTYCrq6vvWqxOnpSYiInI/Dle30Ol0KvfYFMrBBQUFOWu9iFx+WurOC57vOTlauS6npSYiIvLUIDlLliwqOMapaNGiRoFyXFycGl3u2bNnUq0nEREREZHrBcnTp09Xo8idO3dWaRWYVETj7+8v+fPnl2rVqiXVehIRERERuV6Q3LFjR4mNjVUjyK+//rrkzZs3adeMiIiIiCg1HLjn6+srvXr1kvj4eKc8+Z49e6RZs2aSO3duFXyvXbtWf19MTIwMGTJEypQpI+nSpVNtOnToINevXzdaBkawtTQQ7TRhwgSjNqdOnZIaNWqoOs8I7idNmpRgXVavXi3FixdXbfCcGzdudMprJCIiIiIPqG5RpUoVOXHihFOe/OnTp1KuXDmZM2eO2WoZv/32m4wYMUKdr1mzRs6fPy/NmzdP0Bal58LDw/Wnvn376u979OiRmj47JCREjh8/LpMnT5bRo0fL119/rW9z4MABadu2rXTp0kW9tpYtW6rTmTNnnPI6iYiIiMjNq1t8+OGH8vHHH8u1a9ekUqVKapTXUNmyZe1eVuPGjdXJHOQ8b9261ei22bNnqyD9ypUrki9fPv3tGTJkkJw5c5pdzrJlyyQ6OloWLlyocqdLlSo
lJ0+elKlTp6qptmHGjBnSqFEjGTRokLo+btw49dx4vvnz59v9eoiIiIjIQ4PkNm3aqHPD6aeR4qCVhkOli6Ty8OFD9RyZM2c2uh3pFQhsETi3a9dOBgwYoFJD4ODBg1KzZk0VIGsaNmwoEydOlPv376uqHWgzcOBAo2WijWH6h6moqCh1Mhyx1tJEDM8p8ZKqL7GN+vv7iZ83/gBspw75+XhJYGCAcXtfH/nqw8/1l7XbzbZ1ZLmW2nrjAFk/te6J6Q9ul87F/mRfuipum+xLVxTjQrGRI+vgpUN064CwsDCr9yOtITEQ/P70008qzcGcyMhIqV69usobxuiwBiPCFStWVDWakTYxbNgw6dSpk7odkGpRoEAB+eqrr/SPOXfunBpRxnmJEiVUAL1kyRKVcqGZO3euquJx8+ZNs+uDlA1zk6csX75c0qZNm6g+ICIiIqKkg3ReDKhi4DVjxozOHUlObBD8olH/O++8o0ar582bZ3Sf4QgwUj0Q8Pbo0UPGjx8vAQEBSbZOCMYNnxsjyTgosE6dOnL48GGpX7+++Pn5JdnzewK870h7cXZfhoaGSqfe/SWkYTfJkC23zfbhF07IoRXTpXrnERKct3CKtH1857qEbf5GFs2Zrn70uUpfeir2J/vSVXHbZF+6ohgX+g7S9vzbw+EgWYNRWOQGI9/XkLkD65wRIGMEe8eOHTaj/qpVq6pSdZcvX5ZixYqpXGXT0WDtupbHbKmNpTxnQABuLgjX3nycp/SG4C6c3Zc+Pj4SHR0jMfEisXYcuxoTp5PIyCij9t5xsVLxt13q8m8Va0u8j6/Fto4s12LbeFHrjHV/kb7gdulc7E/2pavitsm+dEV+LhAbOfL8DgfJf//9t7z55pty+vRpfS4yaDPwOTMnWQuQ//rrL9m5c6dkzZrV5mNwUJ63t7cEBwer65jg5H//+59altYx+DWDABr5yFqb7du3S//+/fXLQRtOjkKW+MbGSK+5n6rLvebvluh/g2QiIiLy0BJw/fr1U7t7b926pXJvz549q+odv/zyy7Jr1/ORNXthKmsEtThpu8FxGSPUCGrfeustOXbsmMpBRvB948YNddJGr3HAHWYC/P3331XwjnY4aK99+/b6ABh5J0jBQHk3rOuqVatUNQvDVAm8pk2bNsmUKVPkzz//VPnGeN4+ffo42j1ERERE5AYcHv5CYIq0h2zZsqkRW5xee+01lQOMiheO1FBGIIocXo0WuGJ2PwSqv/zyi7pevnx5o8dhVLl27doq3WHlypWqLSpNIHhHkGwYAKOU3JYtW6R3796qZB3We+TIkfryb/Dqq6+qA+6GDx8un376qRQpUkRVtihdurSj3UNEREREnhgkY0QXdYkBASdmwEPqAg7ow2QfjkCga624hq3CG6hqcejQIZvPgwP69u7da7XN22+/rU5ERERERA4HyRhdRXoDRm1xkBymeEY6A2awK1iwIHuUiIiIiDwvSEZKAqaT1qaDbtq0qdSoUUMdVId8XyIiIiIijwuSMROdpnDhwupAt3v37qkD5bQKF0REREREqZlT6lZhtjuilHb79m27ioSj5nZsTOwLPVecj58s7DJSf5mIiIjcC4u7ktsEyO07dZV7jyNsto18FiHX/gmXfC8wh3ycr6/sf61poh9PREREro1BMrkFjCAjQM5erbWkC8phte2tS2ck7OpCiYtNfJBMRERE7o1BMrkVBMgZg/NYbfPk7o0Xfh5MS136zPPyg2dKv6KflpqIiIg8aMY91CO+f/++vqJFRITtXdpE7j4tdb/pA9UJl4mIiMgDg+Q//vhDX/ZtzJgxajppIiIiIiJ3Zdc+YkwL3alTJzX9NGbB+/LLLyV9+vRm22LKZyIiIiIitw+SFy9eLKNGjZL169erWsi//vqr+PomfCjuY5BMRERERB4RJBcrVkxWrlypLnt7e8v27dslODg4qdeNiIiIiChFOHxIfnx8fNKsCRERERGRi0hU3apLly7
J9OnT1QF9ULJkSenXr58UKlTI2etHREREROT6QfLmzZulefPm6mC+6tWrq9v2798vpUqVknXr1kn9+vWTYj2JXAqmol7afpD+MhEREXl4kDx06FAZMGCATJgwIcHtQ4YMYZBMHgHTUu+s+3ZKrwYRERGlZJ1kQ0ix6NKlS4LbO3fuLOfOnXPWehERERERpZ4gOXv27HLy5MkEt+M2VrwgT+EVHyfF/jyuTrhMREREHp5u0a1bN+nevbv8/fff8uqrr+pzkidOnCgDBw5MinUkcjl+MdEyeGIvdbnX/N0SHZAmpVeJiIiIUjJIHjFihGTIkEGmTJkiw4YNU7flzp1bRo8eLR999JEz142IiIiIKHUEyZhVDwfu4fT48WN1G4JmIiIiIiKPrpOsYXBMRERERO7I4QP3iIiIiIjcHYNkIiIiIiJXCpL37NkjzZo1Uwf+Idd57dq1RvfrdDoZOXKk5MqVS9KkSSP16tWTv/76y6jNvXv35L333pOMGTNK5syZVQ3nJ0+eGLU5deqU1KhRQwIDAyVv3rwyadKkBOuyevVqKV68uGpTpkwZ2bhxYxK9aiIiIiJyqyA5JiZG6tatmyBQTaynT59KuXLlZM6cOWbvRzA7c+ZMmT9/vhw+fFjSpUsnDRs2lMjISH0bBMhnz56VrVu3yvr161XgjRJ1mkePHkmDBg0kJCREjh8/LpMnT1aVOL7++mt9mwMHDkjbtm1VgH3ixAlp2bKlOp05c8Ypr5PcT5yPr3z/Tl91wmUiIiJyLw59u/v5+alRWWdp3LixOpmDUeTp06fL8OHDpUWLFuq27777TnLkyKFGnNu0aaNm/9u0aZMcPXpUXn75ZdVm1qxZ8sYbb8iXX36pRqiXLVsm0dHRsnDhQvH395dSpUqpiU+mTp2qD6ZnzJghjRo1kkGDBqnr48aNU0H37NmzVYBOZCrO1082N36fHUNEROSmHB4Ca9++vSxYsEAmTJggSSk0NFRu3LihUiw0mTJlkqpVq8rBgwdVkIxzpFhoATKgvbe3txp5fvPNN1WbmjVrqgBZg9FoTH5y//59yZIli2pjOhEK2pimfxiKiopSJ8MRa2203fCcEs+RvoyLixN/fz/x88ZGHW+1rZ+PlwQGBtjV1tH2SdbWW9Trw+tMzLbF7dK52J/sS1fFbZN96YpiXCg2cmQdHA6SY2Nj1ajstm3bpFKlSioFwhBGaJ0BATJg5NgQrmv34dx0KmxfX18JCgoyalOgQIEEy9DuQ5CMc2vPY8748eNlzJgxCW7fuXOnpE2bVo1Ek3PY25eD+/T495Ll902pmku6VJ1oX1tL7ePiJPPff6uLDwoWFPHxcXzZjrTN4i1SpIfae4JTYnG7dC72J/vSVXHbZF+6oq0uEBtFREQkXZCMPN2KFSuqyxcuXDC6DwffeQrMNmg4+oyRZBwUWKdOHTWKXb9+fZWeQi/2aw9/UPb0JfY8dOrdX0IadpMM2XJbbRt+4YQcWjFdqnceIcF5C9tcD3Pt/aOeyaxBrdXlvvN36qeldmTZjrR9fOe6hG3+RhbNmZ7gR5+z+5LYn8mJ2yb701Vx23TPvtT2/CdJkIyR0uSQM2dOdX7z5k1V3UKD6+XLl9e3uXXrVoKRblS80B6PczzGkHbdVhvtfnMCAgLUyZT25uM8pTcEd2FPX/r4+Eh0dIzExIvE2jgeNSZOJ5GRUXa1tdTe2+BxuE273ZFlO9Q2XtTrw+t8ke2K26VzsT/Zl66K2yb70hX5uUBs5MjzJ7oE3MWLF2Xz5s3y7Nkz/YF2zoTRMgSp27dvN4r+MUpbrVo1dR3nDx48UFUrNDt27JD4+HiVu6y1QcULwxwU/JopVqyYSrXQ2hg+j9ZGex4iIiIi8iwOB8l3795VZeCKFi2qqkiEh4er21E+7eOPP3ZoWahnjEoTOGm7zHH5ypUrKnWjf//+8tlnn8kvv/wip0+flg4dOqiKFSjPBiVKlFBVKbp16yZHjhyR/fv3S58
+fdRBfWgH7dq1UwftYf1QKm7VqlWqmoVhqkS/fv1UlYwpU6bIn3/+qUrEHTt2TC2LiIiIiDyPw0HygAED1FA1AlkcoKZ59913VaDpCASiFSpUUCdA4IrLmEAEBg8eLH379lWl2ipXrqyCajwHJvzQoMQbJgFB4I6g/bXXXjOqgYyKGFu2bFEBOA40RCCP5RvWUn711Vdl+fLl6nGo2/zDDz+oyhalS5d2tHuIiIiIyA04nJOMgBNpFnny5DG6vUiRIhIWFubQsmrXrm01TQOjyWPHjlUnS1DJAgGuNWXLlpW9e/dabfP222+rExERERGRd2JmyTMcQdbgYDlzB7IREREREbl9kFyjRg01853haC8OlMMU0ih/RuQJMBX1zy26qhOnpSYiInI/DqdbIBhG/i/yiTHdM/KGcUAcRpJx4ByRsyGfHKXPrEGqT2xMbLJOS/1Ly//y2omIiMjDg2QczIZJRGbPni0ZMmRQB9O1atVKevfubVTPmOhF3blzR51jkhDUCLYm8lmEXPsnXPK5wJSXRERE5IFBslYx4n//+5/z14bIwOPHj9V5tiotxD+T8fTjpm5dOiNhVxdKXGzyBMle8fGSKzxUXQ7PVUB03okuOU5ERETuEiTfv39fFixYIH/88Ye6XrJkSenUqZOqNEHkbOmyBEuabMbVVEw9uXsjWTveLyZKxg1vqy73mr9bPy01ERERuQeHh78we13+/Pll5syZKljGCZcxQx7uIyIiIiLyuJFk5B5j4pB58+bpD6aKi4uTDz/8UN2HmfGIiIiIiDxqJPnixYtq1jrDagO4jNnycB8RERERkccFyRUrVtTnIhvCbZjSmYiIiIjII9ItTp06pb/80UcfSb9+/dSo8SuvvKJuO3TokMyZM0cmTJiQdGtKRERERORKQXL58uXVzHo6nU5/GyYRMdWuXTuVr0xERERE5PZBMmY8I6L/YCrqTY3a6y8TERGRe7Hr2z0kJCTp14QoFcG01Kvf/SilV4OIiIiSSKKGwK5fvy779u2TW7duSXx8vNF9yFkmIiIiIvKoIHnx4sXSo0cP8ff3l6xZs6pcZQ0uM0gmT4BpqYPuPZ/l715QTk5LTURE5OlB8ogRI2TkyJEybNgw8fZ2uIIckVvAtNSTBrVUlzktNRERkftxOMqNiIiQNm3aMEAmIiIiIrflcJDcpUsXWb16ddKsDRERERFRaky3GD9+vDRt2lQ2bdokZcqUET8/P6P7p06d6sz1IyIiIiJKHUHy5s2bpVixYuq66YF7REREREQeFyRPmTJFFi5cKB988EHSrBERmRUTHS1hYWF2907GjBkle/bs7E0iIqLkCJIDAgKkevXqiXkuIkqkqCcP5XLo39L/09Hqb9AeQRnSytJF3zJQJiIiSo4guV+/fjJr1iyZOXNmYp6PyC3Ee/vIjtff0l9OajFRzyTey1eyvdJKsua2PQPm03s35fbBH+XRo0cMkomIiJKjusWRI0dkyZIlUrBgQWnWrJm0atXK6ORs+fPnV7nOpqfevXur+2vXrp3gvp49exot48qVK9KkSRNJmzatBAcHy6BBgyQ2Ntaoza5du6RixYpqlK5w4cJq0hQiS2L9/GXZ+4PVCZeTS9os2SVjcB6bp3RBOfjmERERJedIcubMmZMkGLbk6NGjEhcXp79+5swZqV+/vrz99tv627p16yZjx47VX0cwrMFjESDnzJlTDhw4IOHh4dKhQwdVleOLL75QbUJDQ1UbBNfLli2T7du3S9euXSVXrlzSsGHDZHutRERERJRKg+RFixZJcjI98GjChAlSqFAhqVWrllFQjCDYnC1btsi5c+dk27ZtkiNHDilfvryMGzdOhgwZIqNHj1bTa8+fP18KFCigDkqEEiVKyL59+2TatGkMksk8nU7SP36gLj7JkBmlXdhTREREnhwkp6To6GhZunSpDBw40KjcHEZ/cTsCZaSAYOpsbTT54MGDqp4zAmQNRod79eolZ8+elQo
VKqg29erVM3outOnfv7/FdYmKilInDXI/ISYmxuicEk/bg+DrjQ013mpbPx8vCQwMED8nt7XU3j/6mczo93wvQ9/5OyU6IE2SrofD6+wt4u/vp/oQ2yK3S+dif7IvXRW3TfalK4pxodjIkXXw0ul0OkcWjhFXa/WQ//77b0kq33//vbRr107lGOfOnVvd9vXXX0tISIi6furUKTVCXKVKFVmzZo26v3v37qpsFmo7G06tnS5dOtm4caM0btxYihYtKp06dZJhw4bp2+A+pGCgbZo0zwMgQxiFHjNmTILbly9fbpTuQe7JJzJSmrZpoy6vX7lS4gIDU3qViIiIyAbEdYglHz58qEqlOnUk2XR0FRH5iRMn1Ax8OCAuKS1YsEAFtVqArAXBGowYI4+4bt26cunSJZWWkVQQUGNE23AkOW/evFKnTh05fPiwyps2nY2QHHPx4kW5cOGC/HgpXtIE/feemxN+4YQcWjFdqnceIcF5CzutraX2/lHPpOm/929+kEM/kpxU6+HoOj++c13CNn8ji+ZMVz9s8Xe6detWbpdOwv50Hvalc7E/2ZeuKMaFvoO0Pf9JVgLOnDlz5sixY8ckqWA0GHnF2gixJVWrVtUHWAiSkYKBihyGbt68qc61PGaca7cZtsEvDHOjyIAqGObq1WpvPs5TekNI7Xx8npdWi40XibVRiCUmTieRkVES4+S2ltp7GzwOt2m3J9V6OLzO8UhPilF9aLgdcrt0LvYn+9JVcdtkX7oiPxeIjRx5fodLwFmCEd4ff/xRkgoOGET5NqRAWHPy5El1jhFlqFatmpw+fVpu3bqlb4NfMwiAS5YsqW+DihaG0Aa3ExEREZHncVqQ/MMPP0hQUJAkhfj4eBUkd+zYUXx9/xv8RkoFKlUcP35cLl++LL/88osq71azZk0pW7asatOgQQMVDL///vvy+++/q9zk4cOHqzrL2kgwSr8hl3rw4MHy559/yty5c1X+84ABA5Lk9RARERGRa3M43QLVIAwP3MNxfzdu3JDbt2+r4DIpIM0CB+t17tzZ6HaUb8N906dPl6dPn6qc4NatW6sgWIPdzevXr1fVLDAyjAP2EGwb1lVGzuaGDRtUUDxjxgzJkyePfPvttyz/RkREROShHA6SW7ZsaXTd29tb1TLGzHfFixeXpIDRYHNFOBAU79692+bjUf0C1SqswfrjAEQie2Aq6v3VmyTbtNRERETk4kHyqFGjkmZNiFIRTEW9sCv/FoiIiNyV03KSiYiIiIg8biQZaRXWJhEB3B8bG+uM9SJybTqd+EdHqovR/oGclpqIiMhTg+SffvrJ4n2Y1nnmzJmqCgWRJ0CAPK9nLXW51/zd+slEiIiIyMOC5BYtWiS47fz58zJ06FBZt26dvPfee0YVI4iIiIiIPCon+fr169KtWzc1DTTSKzCBx5IlS1QVCSIiIiIijwqSHz58KEOGDJHChQvL2bNn1Sx1GEUuXbp00q0hEREREZGrpltMmjRJJk6cKDlz5pQVK1aYTb8gIiIiIvKoIBm5x2nSpFGjyEitwMmcNWvWOHP9iIiIiIhcN0ju0KGDzRJwREREREQeFSQvXrw4adeEKBWJ9/aWYy+/rr9MREREHj4tNRFhWuoAmdd7AruCiIjITXEIjIiIiIjIBINkIiIiIiITDJKJEsE/6pks6FRFnXCZiIiI3AuDZCIiIiIiEwySiYiIiIhMMEgmIiIiIjLBIJmIiIiIyASDZCIiIiIiEwySiYiIiIhMcMY9okTAVNSnylbXXyYiIiL3wiCZKJHTUs8YMI19R0RE5KY4BEZEREREZIJBMhERERFRagqSR48eLV5eXkan4sWL6++PjIyU3r17S9asWSV9+vTSunVruXnzptEyrly5Ik2aNJG0adNKcHCwDBo0SGJjY43a7Nq1SypWrCgBAQFSuHBhWbx4cbK9RkqdMBX13B411YnTUhMREbkflw6SoVSpUhIeHq4/7du3T3/fgAEDZN2
6dbJ69WrZvXu3XL9+XVq1aqW/Py4uTgXI0dHRcuDAAVmyZIkKgEeOHKlvExoaqtrUqVNHTp48Kf3795euXbvK5s2bk/21UuoSEB2pTkREROR+XP7APV9fX8mZM2eC2x8+fCgLFiyQ5cuXy+uvv65uW7RokZQoUUIOHTokr7zyimzZskXOnTsn27Ztkxw5ckj58uVl3LhxMmTIEDVK7e/vL/Pnz5cCBQrIlClT1DLweATi06ZNk4YNG1pcr6ioKHXSPHr0SJ3HxMQYnVPi4UeO2ga8saHGW23r5+MlgYEB4ufktpbaGz4Ol/EvKdfD4XX2FvH391N9iG2R26VzsT/Zl66K2yb70hXFuFBs5Mg6eOl0Op24KASykydPlkyZMklgYKBUq1ZNxo8fL/ny5ZMdO3ZI3bp15f79+5I5c2b9Y0JCQtRoMEaZMWL8yy+/qBFiw5HjggULym+//SYVKlSQmjVrqlSL6dOn69sg2MYyEIhbW7cxY8YkuB1BO1I7yL35REZK0zZt1OX1K1dKXGBgSq8SERER2RARESHt2rVTMV7GjBlT70hy1apVVXpEsWLFVKoFgtIaNWrImTNn5MaNG2ok2DBABowY4z7AOa6b3q/dZ60NRoafPXsmadKkMbtuw4YNk4EDB+qvo33evHlV2sbhw4elfv364ufn56Se8EwXL16UCxcuyI+X4iVNUG6rbcMvnJBDK6ZL9c4jJDhvYae1tdQeechN/71/84McEh2QJknXw9F1fnznuoRt/kYWzZmu9pTgl/PWrVu5XToJ+9N52JfOxf5kX7qiGBf6DtL2/NvDpYPkxo0b6y+XLVtWBc0YKf7+++8tBq/JBQf54WRKe/NxntIbQmrn4+OjzmPjRWJtpM/HxOkkMjJKYpzc1lJ7b4PH4Tbt9qRaD4fXOV4kOjpG9aHhdsjt0rnYn+xLV8Vtk33pivxcIDZy5Pld/sA9Qxg1Llq0qBphRJ4yDsh78OCBURtUt9BymHFuWu1Cu26rDYbgUzoQJyIiIqKUkaqC5CdPnsilS5ckV65cUqlSJfVrYPv27fr7z58/r0q+IXcZcH769Gm5deuWvg2G+xEAlyxZUt/GcBlaG20ZRObovLzkz2IV1QmXiYiIyL24dLrFJ598Is2aNVMpFijvNmrUKLX7uG3btupgvi5duqi84KCgIBX49u3bVwW3qGwBDRo0UMHw+++/L5MmTVL5x8OHD1e1lbVUiZ49e8rs2bNl8ODB0rlzZ3VAINI5NmzYkMKvnlxZjH+gTB46P6VXg4iIiDwxSL527ZoKiO/evSvZs2eX1157TZV3w2VAmTZvb281iQjKsaFk29y5c/WPR0C9fv166dWrlwqe06VLJx07dpSxY8fq2+CgJgTEqIYxY8YMyZMnj3z77bdWy78RERERkXtz6SB55cqVVu9HWbg5c+aokyUYhd64caPV5dSuXVtOnDiR6PUkIiIiIvfi0kEykatCCbhJn7RQlwd/+bO+BJwriYmOlrCwMKOJWVAnXKsaYgjpStoeGiIiImKQTJRoGZ4YV1ZxJVFPHsrl0L+l/6ejVf49Zt8b3KeHdOrdX5WGMxWUIa0sXfQtA2UiIqJ/cSSZyA3FRD2TeC9fyfZKK8maO0RNUw0hDbupGsqGnt67KbcP/qgKrHM0mYiI6DkGyURuLG2W7JIxOI/4CiLjG5IhW26zk5HcTpG1IyIicl2pqk4yEREREVFyYJBMRERERGSC6RaUrG7fvq1yX+1x9erVJF8fIiIiInMYJFOyBsjtO3WVe48j7Gqvi4uVEYP6S3RMjLhagTVMRR2av4T+MhEREbkXBsmUbDCCjAA5e7XWki4oh832D8LOqfO42IQly1xhWurPRi1J6dUgIiKiJMIgmZIdAmRUXLAl6sHNZFkfIiIiIlM8cI+IiIiIyASDZKJE8I+KlImftFAnXCYiIiL3wnQLokTRSba74frLRERE5F44kkxEREREZIJBMhERERGRCQbJRER
EREQmGCQTEREREZlgkExEREREZILVLYgSxUv+yV1Af5mIiIjcC4NkokSIDgiUkZ+vYt8RERG5KaZbEBERERGZYJBMRERERGSCQTJRImAq6rH/e1edOC01ERGR+2FOMlGi6OSl66H6y0REROReXHokefz48VK5cmXJkCGDBAcHS8uWLeX8+fNGbWrXri1eXl5Gp549exq1uXLlijRp0kTSpk2rljNo0CCJjY01arNr1y6pWLGiBAQESOHChWXx4sXJ8hqJiIiIyPW49Ejy7t27pXfv3ipQRlD76aefSoMGDeTcuXOSLl06fbtu3brJ2LFj9dcRDGvi4uJUgJwzZ045cOCAhIeHS4cOHcTPz0+++OIL1SY0NFS1QXC9bNky2b59u3Tt2lVy5colDRs2TOZXTZT8YqKjJSwszK62GTNmlOzZsyf5OhEREaUklw6SN23aZHQdo7sYCT5+/LjUrFnTKChGEGzOli1bVFC9bds2yZEjh5QvX17GjRsnQ4YMkdGjR4u/v7/Mnz9fChQoIFOmTFGPKVGihOzbt0+mTZvGIJncXtSTh3I59G/p/+lotSfFlqAMaWXpom8ZKBMRkVtz6SDZ1MOHD9V5UFCQ0e0Y/V26dKkKlJs1ayYjRozQjyYfPHhQypQpowJkDUaHe/XqJWfPnpUKFSqoNvXq1TNaJtr079/f4rpERUWpk+bRo0fqPCYmxuicxGhU39/fT/y8seHF2+waP5/nk3TY0x5tAwMDnN7WUnvDx+Ey/iXlerzoOptbb73YSPFPk05yV28tQbnyWV3u0/u35M6Rn+X+/fuSOXNm8VT8O2dfuipum+xLVxTjQrGRI+vgpdPpUsVRR/Hx8dK8eXN58OCBGuXVfP311xISEiK5c+eWU6dOqRHiKlWqyJo1a9T93bt3V7uRN2/erH9MRESEStfYuHGjNG7cWIoWLSqdOnWSYcOG6dvgPqRgoG2aNGkSrA9GoceMGZPg9uXLlxule5B78omMlKZt2qjL61eulLjAwJReJSIiIrIBcV27du3UwCvSB91iJBm5yWfOnDEKkLUgWIMRY+QR161bVy5duiSFChVKsvVBQD1w4ECjkeS8efNKnTp15PDhw1K/fn2V9+wJ7ty5I48fP7bZ7urVq/K/cZOkYNOekiFbbtvLvXRSOrycU5adeSJZXipstW34hRNyaMV0qd55hATndV5bS+39oiLllazPU3w2PcgpMQGBSboeL7rOGEFukOWWbLkfLLEmx+s6suzHd65L2OZvZNGc6SpFyVNhJGLr1q0e9XeeVNiX7E9XxW3TPftS2/Nvj1QRJPfp00fWr18ve/bskTx58lhtW7VqVXV+8eJFFSQjBePIkSNGbW7evKnOtTxmnGu3GbbBLwxzo8iA3E1z+Zvam4/zlN4QksPt27flg2495d7jCJttI59FyLV/wiVHVIyksaOwSkzc850cMfGSILAz1zYyMsrpbS21jw1IK0O+/CXZ1sMZ66zWW7wTPN6h9YgXiY6OER8fH4/Yvm3xlL/z5MC+ZH+6Km6b7tWXjjy/SwfJyATp27ev/PTTT6pEmz0jVydPnlTnGFGGatWqyeeffy63bt1SB/0Bfs0gAC5ZsqS+DdIrDKENbifbv8gQIGev1lrSBf2X923OrUtnJOzqQomLTfmcJCIiIqJUGyQjxQI5vj///LOqlXzjxg11e6ZMmdQIL1IqcP8bb7whWbNmVTnJAwYMUJUvypYtq9qiZByC4ffff18mTZqkljF8+HC1bG0kGKXfZs+eLYMHD5bOnTvLjh075Pvvv5cNGzak6OtPTRAgZwy2Psr/5O7z94+IiIjI1bn0ZCLz5s1TidWYMAQjw9pp1apV6n6Ub0NpNwTCxYsXl48//lhat24t69at0y8Du4WRqoFzjAy3b99e1Uk2rKuMEWoExBg9LleunCoF9+2337L8G1nkFx0pw8d0VCdcJiIiIvfi0iPJtgpv4EA5TDhiC6pfmKZTmEIgfuL
ECYfXkTyTl04nBS7/ob9MRERE7sWlR5KJiIiIiFICg2QiIiIiIhMMkomIiIiITDBIJiIiIiIywSCZiIiIiCg1VbcgcmWP02dO6VUgIiKiJMIgmSgRogPSSP9ZW9h3REREborpFkREREREJjiSTGbdvn1bHj16ZLN3wsLCJDYmlr1IREREboVBMpkNkNt36ir3HkfY7J3IZxFy7Z9wyRcT41E9iamo+0/try5PHzhdYvwDU3qViIiIyIkYJFMCGEFGgJy9WmtJF5TDag/dunRGwq4ulLhYzwqSMRV18fO/6S8TERGRe2GQTBYhQM4YnMdqDz25e4M9SERERG6HQTIROSQmOlrlotsjY8aMkj17dvYwERGlOgySichuUU8eyuXQv6X/p6MlICDAZvugDGll6aJvGSgTEVGqwyCZiOwWE/VM4r18JdsrrSRr7hCrbZ/euym3D/6octw5mkxERKkNg2QicljaLNlt5qvDdQdSM4DpGURE5CoYJBMlUhTLvjk1NQOYnkFERK6CQTJRIqel/vCrPew7J6VmANMziIjIlTBIJiKXSM1wND0jOjpa/P397WrLNA4iInIUg2QiSnXpGShD98+VMMkTUkB8/Wx/jDGNg4iIHMUgmSgRfGOipPfsoerynD4TJNbPvpxbck56BmZ6/PvyQslSpQWrbBARUZJgkEyUCN7x8VL21H79ZUre9Axtpkd7UzluO23tiIjIUzBIJiK358gsgcAcZiIiYpBMRG6NpeiIiCgxGCSbmDNnjkyePFlu3Lgh5cqVk1mzZkmVKlXEHdy+fVvNfmYLRtxiY2KTZZ2IXLEU3fXdK+T06dMSEmK+fVxcnDoPDQ2VLFmycEZBIiI3xCDZwKpVq2TgwIEyf/58qVq1qkyfPl0aNmwo58+fl+DgYEntAXL7Tl3l3uMIm20jn0XItX/CJV9MTLKsG1FysDd/2Z6RZ39/Pxncp4d06t1f/CVeJn4+VrJmzerUsnWOtHW0PdNJiIhsY5BsYOrUqdKtWzfp1KmTuo5gecOGDbJw4UIZOvR5JYPUODKsjQ7fuvdIctV8V9IF5bBZOSDs6kKJi2WQTJ7HnpFnP+/n5xlL15WDK6ZL148+cWrZOkdL3DnaPr2/j8sE9j4+PvpRee2yK/8IcORzNynX2dJ6GO7l0PqTP4qIEodBssGH2fHjx2XYsGH6zvH29pZ69erJwYMHE3RcVFSUOmkePnyozu/duycRERFy9+5d8fPzk6SC5fcdOEgePLU9MgzRzyLl+s2bku3ZY5HoDFbbesfjg91XIm9fk0eWv7OUyHvXk6QtPLsXLhERmeTZnX/kkVfKrYe59v7RkaJ9PT365y+J/neK6qRajxddZx9vkYgAb3l4/aLEmRTjcNV1Tun18I6NEok2//cVj/6M8JbYZ4/ENyBQ0hd9RTIGWQ9wHoSHie76dUlTuLJT2zra/vHtG3J271rp2X+w+AVY/4yKi46V8OtXJddLIeKj/TJwQlvD9iEFCknfbh9Il979JdrCD3NHl53Bz1eGDflEpcI40/3792X8pCnyONr2AEJSrrO19fD39ZMPO79v1J9J1R/uDj848H1+8uRJqz/g6MX7MlOmTJI5c2ZJDo8fP1bnOp3OZlsvnT2tPMD169flpZdekgMHDki1atX0tw8ePFh2794thw8fNmo/evRoGTNmTAqsKRERERG9iKtXr0qePNZT8DiSnEgYcUb+siY+Pl6NImP0OF++fKrzsYuLEg+7EvPmzcu+dAL2pXOxP9mXrorbJvvSFT1yoe9zjA1jNDl37tw22zJI/le2bNnULoCbN28adRCu58yZM0HHIf/QNAcRuwq0HDFsBCm9IbgL9iX70lVx22Rfuipum+xLV5TRRWIjpHfYw3ailIfAwRWVKlWS7du3G40O47ph+gURERERuT+OJBtA+kTHjh3l5ZdfVrWRUQLu6dOn+moXREREROQZGCQbePfdd1VZnZEjR6rJRMqXLy+bNm2
SHDmsl0wzhBSMUaNG2SwHRezL5MTtkv3pqrhtsj9dFbdN50mtfcnqFkREREREJpiTTERERERkgkEyEREREZEJBslERERERCYYJBMRERERmWCQ7ERz5syR/PnzS2BgoFStWlWOHDnizMW7pfHjx0vlypUlQ4YMEhwcLC1btpTz588btaldu7Z4eXkZnXr27Jli6+zKMF26aV8VL15cf39kZKT07t1bsmbNKunTp5fWrVsnmECHnsPfsmlf4oT+A26X1u3Zs0eaNWumZrVCv61duzbBrFeoJJQrVy5JkyaN1KtXT/766y+jNpjF9L333lOTD2Cypi5dusiTJ088bhO11pcxMTEyZMgQKVOmjKRLl0616dChg1y/ft3m9jxhwgTxNLa2yw8++CBBPzVq1MioDbdL+/vT3GcoTpMnT04V2yaDZCdZtWqVqrOMEie//fablCtXTho2bCi3bt1y1lO4pd27d6ug49ChQ7J161b1gd+gQQNVn9pQt27dJDw8XH+aNGlSiq2zqytVqpRRX+3bt09/34ABA2TdunWyevVq1ff4Im3VqlWKrq+rOnr0qFE/YvuEt99+W9+G26Vl+BvG5yAGD8zB3/DMmTNl/vz5cvjwYRXg4TMTP+Q0CJDPnj2r+n79+vXqC7l79+7iaaz1ZUREhPrOGTFihDpfs2aNGmho3rx5grZjx4412qb79u0rnsbWdgkIig37acWKFUb3c7u0vz8N+xGnhQsXqiAYAzSpYtvUkVNUqVJF17t3b/31uLg4Xe7cuXXjx49nDzvg1q1bOmyWu3fv1t9Wq1YtXb9+/diPdhg1apSuXLlyZu978OCBzs/PT7d69Wr9bX/88Yfq74MHD7J/bcA2WKhQIV18fDy3SwdhG/vpp5/019GHOXPm1E2ePNlo+wwICNCtWLFCXT937px63NGjR/Vtfv31V52Xl5fun3/+8djt1bQvzTly5IhqFxYWpr8tJCREN23atGRYw9Tdlx07dtS1aNHC4mO4Xb7Ytom+ff31141uc+VtkyPJThAdHS3Hjx9Xuws13t7e6vrBgwed8RQe4+HDh+o8KCjI6PZly5ZJtmzZpHTp0jJs2DA1ekLmYZc1dn0VLFhQjXhcuXJF3Y5tFCP1htspUjHy5cvH7dSOv/GlS5dK586d1SgIt8sXExoaqiZsMtwWM2XKpNLUtM9MnCPFAjOgatAen60YeSbrn6PYTtF/hrALG6lWFSpUULu7Y2Nj2Y1m7Nq1S6X/FStWTHr16iV3797V38ftMvGQ2rdhwwaVNmXKVbdNzrjnBHfu3JG4uLgEM/Ph+p9//umMp/AI8fHx0r9/f6levboKhjXt2rWTkJAQFfidOnVK5d9hdyJ2K5IxBBmLFy9WH+7YZTVmzBipUaOGnDlzRgUl/v7+Cb44sZ3iPrIMeXYPHjxQ+YrcLl+ctr2Z+8zU7sM5AhVDvr6+6gc0t1fLkK6Cz8i2bduqXG7NRx99JBUrVlT9d+DAATXYgM+IqVOnOuEddR9ItUAKWoECBeTSpUvy6aefSuPGjVVw7OPjw+3yBSxZskQdf2Sa4ufK2yaDZHIZyE1GMGeYQwuGOYg4OAUH+tStW1d9gBUqVCgF1tR14cNcU7ZsWRU04wfG999/rw6OosRZsGCB6lv8UNNwuyRXgz1F77zzjjooct68eUb34ZgZw88G/GDu0aOHOng6tU0VnJTatGlj9H2DvsL3DEaX8b1DiYd8ZOzdRHGD1LJtMt3CCZAGgF+YplUCcD1nzpzOeAq316dPH3Vgzs6dOyVPnjxW2yLwg4sXLybT2qVeGDUuWrSo6itsi0gbwIioIW6n1oWFhcm2bduka9euVttxu7Sf9rlo7TMT56YHPmMXLCoL8HPVcoCM7RUHOhqOIlvaXtGfly9fduCd8zxIW8N3vPZ9w+0ycfbu3av2ANv6HHW1bZNBshPgV0+lSpVk+/btRqkDuF6tWjVnPIXbwogHAuSffvpJduzYoXZx2XL
y5El1jhFlsg7lsjDijr7CNurn52e0neJDCznL3E4tW7Rokdrt36RJE26XToK/cwQbhtvio0ePVK6xti3iHD/okEuvwWcEPlu1HyRkHCDjeAT8oENupz2fo8jvNk1pIWPXrl1TOcna9w23y8TvjcN3ECphpKptM6WPHHQXK1euVEdmL168WB392r17d13mzJl1N27cSOlVc2m9evXSZcqUSbdr1y5deHi4/hQREaHuv3jxom7s2LG6Y8eO6UJDQ3U///yzrmDBgrqaNWum9Kq7pI8//lj1Jfpq//79unr16umyZcumqoZAz549dfny5dPt2LFD9Wm1atXUicxDlRr015AhQ4xu53Zp2+PHj3UnTpxQJ3zVTJ06VV3WKi5MmDBBfUbib/rUqVPqqPcCBQronj17pl9Go0aNdBUqVNAdPnxYt2/fPl2RIkV0bdu29bjN1VpfRkdH65o3b67LkyeP7uTJk0afo1FRUerxBw4cUNUDcP+lS5d0S5cu1WXPnl3XoUMHnaex1pe475NPPlHVfvAZum3bNl3FihXVdhcZGalfBrdL+//O4eHDh7q0adPq5s2bpzPl6tsmg2QnmjVrlvpC9ff3VyXhDh065MzFuyX8UZk7LVq0SN1/5coVFRAHBQWpHyGFCxfWDRo0SP3RUULvvvuuLleuXGobfOmll9R1BHQaBCAffvihLkuWLOpD680331RfpmTe5s2b1fZ4/vx5o9u5Xdq2c+dOs3/bKLGllYEbMWKELkeOHOpvu27dugn6+e7duyooTp8+vS5jxoy6Tp06qS9lT2OtLxHMWfocxePg+PHjuqpVq6oBicDAQF2JEiV0X3zxhVHg5yms9SUGZxo0aKCCNJTLRGmybt26JRjs4nZp/985fPXVV7o0adKoMo+mXH3b9MJ/KT2aTURERETkSpiTTERERERkgkEyEREREZEJBslERERERCYYJBMRERERmWCQTERERERkgkEyEREREZEJBslERERERCYYJBMRERERmWCQTERkxeXLl8XLy0tOnjzpMv30559/yiuvvCKBgYFSvnx5cSWLFy+WzJkziyf64IMPpGXLlql2uyIiYwySicjlAw8EExMmTDC6fe3atep2TzRq1ChJly6dnD9/XrZv357Sq+NxLAW4M2bMUD8SiMg9MEgmIpeHEdOJEyfK/fv3xV1ER0cn+rGXLl2S1157TUJCQiRr1qyS2tY/Na+rtWVlypTJY0fRidwRg2Qicnn16tWTnDlzyvjx4y22GT16dILUg+nTp0v+/PkT7A7/4osvJEeOHCqgGTt2rMTGxsqgQYMkKChI8uTJI4sWLTKb4vDqq6+qgL106dKye/duo/vPnDkjjRs3lvTp06tlv//++3Lnzh39/bVr15Y+ffpI//79JVu2bNKwYUOzryM+Pl6tE9YjICBAvaZNmzbp78cI5vHjx1UbXMbrNrV+/Xr12uLi4tR1jHii7dChQ/VtunbtKu3bt9df//HHH6VUqVLqOdFnU6ZMMVombhs3bpx06NBBMmbMKN27d1e3Y+Q0X758kjZtWnnzzTfl7t27Ro/7/fffpU6dOpIhQwb1uEqVKsmxY8fMvnbt9c2bN0/1ZZo0aaRgwYLyww8/GLW5evWqvPPOO+o14j1r0aKFGt01fZ8///xzyZ07txQrVszijw08Fu8X3rfKlSvLtm3bbL7uAgUKqPsqVKig1hfvreHzGr6XkyZNksKFC6t+RT9hnSyxtQ0RUfJikExELs/Hx0cFtrNmzZJr16690LJ27Ngh169flz179sjUqVNV6kLTpk0lS5YscvjwYenZs6f06NEjwfMgiP7444/lxIkTUq1aNWnWrJk+IHzw4IG8/vrrKmhCAIig9ubNmyqQM7RkyRLx9/eX/fv3y/z5882uH3bZI0D98ssv5dSpUyqYbt68ufz111/q/vDwcBXMYl1w+ZNPPkmwjBo1asjjx4/VugICegTmu3bt0rfBbVpwh6Ab69qmTRs
5ffq0CrxHjBiRIHUA61SuXDm1XNyP/urSpYsK/hGIIxj+7LPPjB7z3nvvqYD/6NGj6nkQqPv5+Vl9j7Ds1q1bqwAbj8d6/fHHH+q+mJgY1ScIuvfu3av6EkFlo0aNjEZ5kYaCdJStW7eqHw3mPHnyRN544w3VFq8Jy8D7euXKFauv+8iRI+p2BNR4D9asWWN2+cOGDVNpQnjMuXPnZPny5Sr4NcfebYiIkpGOiMiFdezYUdeiRQt1+ZVXXtF17txZXf7pp590hh9ho0aN0pUrV87osdOmTdOFhIQYLQvX4+Li9LcVK1ZMV6NGDf312NhYXbp06XQrVqxQ10NDQ9XzTJgwQd8mJiZGlydPHt3EiRPV9XHjxukaNGhg9NxXr15Vjzt//ry6XqtWLV2FChVsvt7cuXPrPv/8c6PbKleurPvwww/11/E68XqtqVixom7y5MnqcsuWLdUy/f39dY8fP9Zdu3ZNrduFCxfU/e3atdPVr1/f6PGDBg3SlSxZUn8d/YblGGrbtq3ujTfeMLrt3Xff1WXKlEl/PUOGDLrFixfr7IX16tmzp9FtVatW1fXq1Utd/r//+z/1nsXHx+vvj4qK0qVJk0a3efNm/fucI0cOdbujSpUqpZs1a5bV161tEydOnLC4rT569EgXEBCg++abb8w+j+ky7NmGiCh5cSSZiFIN5CVjNFYbVUwMjMJ6e//30YeRvTJlyhiNWiPP99atW0aPw+ixxtfXV15++WX9emDEc+fOnWpEUzsVL15cv0tfg1QDax49eqRGuatXr250O647+ppr1aqlRo4Rd2LEtVWrVlKiRAnZt2+fGkVGGkKRIkVUWyzb3HNi9FpL2QC8ZkN4XNWqVS32EwwcOFCldiBlBqOqhv1hiekycN2wry9evKhGkrW+RspFZGSk0bLxnmLU3hqMJGMkHv2C1A0sC89jOpJs+rrtgeVERUVJ3bp17Wpv7zZERMnHNxmfi4johdSsWVPtasdubOR/GkLg+3wg8j/YNW/KdFc/ckrN3YZ8Unsh2MJuegTxpnLlyqW/jIoUyQWpFAsXLlTBF14fAi7chsAZB0AiiHZUYtYfqRvt2rWTDRs2yK+//qrSW1auXKnylxMDfY0fG8uWLUtwX/bs2R1aVwTISMdAOgXyhpED/dZbbyU4OC8xrxvLcoS92xARJR+OJBNRqoLRyHXr1snBgwcTBEg3btwwCpSdWYP20KFD+ss40A/5tRiBhIoVK8rZs2fVQV4ItgxPjgRYODAMI7zIszWE6yVLlnRofbW85GnTpukDYi1IxknLRwa8DnPPWbRoUTWybgkeh7xkS/2kwXIGDBggW7ZsUSPa5g6MtLYMXDfsa4xwBwcHJ+hrVJdwBF4jfmwhYMfIMw4ONTwA0BJthNpwlN0URukRKNtbos9Z2xAROQ+DZCJKVRDM4GCumTNnGt2OoO/27duqmgB2T8+ZM0eNXDoLlvfTTz+pKhe9e/dWo7GdO3dW9+H6vXv3pG3btuoANTz/5s2bpVOnTlYDKXNwgCBGE1etWqUOPMOBbgj2+/Xr59BycCBi2bJl1YirFhBjJP63336TCxcuGI0k4yBABHOo4oD7kNIye/ZsswcFGvroo4/UAWYYiUXgiscYVuJ49uyZOqgPQXlYWJgKStE/WsBryerVq9UoONYFI884UA7LAbz3OAgRVSmQRhIaGqqWj3Vx9KBOBLI46A79ixF3jHjbswcBAToCYO3guocPHyZogyooQ4YMkcGDB8t3332ntgkE+wsWLDC7TGduQ0TkHAySiSjVQfkz02AGgdfcuXNVMItKBAisbAV5jo5g44RlI6/3l19+UcEaaKO/CGYaNGigAnmUekOeq2H+sz0Q7CGPF4ErloNADM+l5Q87AoEw1kkLkpG7ixFpjJgalkXDKOb333+v0iBQ3m7kyJGqj01TWkxh1r9vvvlGVeRAv2CkePjw4fr7MQqNCiAon4bRZFRqQImzMWPGWF0u7se6IMh
HgLlixQr9SDpKzaEyCcqpaXnWqLCBnGSMxDsC1U3wYwKl/ZDqgFQe9IUtyEnHj7SvvvpKvfcI2M1BVQu8j+hPrOe7776bINdd48xtiIicwwtH7zlpWURERC8E+eAYsbd3emcioqTCn6dERERERCYYJBMRERERmWAJOCIichnMACQiV8GRZCIiIiIiEwySiYiIiIhMMEgmIiIiIjLBIJmIiIiIyASDZCIiIiIiEwySiYiIiIhMMEgmIiIiIjLBIJmIiIiISIz9P+ipjbzERJcSAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Count words per training example\n", + "lengths = train_df[\"text\"].astype(str).str.split().map(len)\n", + "\n", + "plt.figure(figsize=(8, 4))\n", + "lengths.hist(bins=50, edgecolor=\"black\", alpha=0.7)\n", + "plt.xlabel(\"Number of words per article\")\n", + "plt.ylabel(\"Number of articles\")\n", + "plt.title(\"Distribution of Article Lengths (Training Set)\")\n", + "\n", + "# Add a vertical line at the median for reference\n", + "plt.axvline(lengths.median(), color=\"red\", linestyle=\"--\", label=f\"Median: {lengths.median():.0f} words\")\n", + "plt.legend()\n", + "\n", + "# Save the plot to reports/ so it's available outside the notebook\n", + "reports_dir = PROJECT_ROOT / \"reports\"\n", + "reports_dir.mkdir(exist_ok=True)\n", + "plt.savefig(reports_dir / \"eda_token_lengths.png\", dpi=160, bbox_inches=\"tight\")\n", + "print(f\"Saved plot to: {reports_dir / 'eda_token_lengths.png'}\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4c. Label Distribution\n", + "\n", + "Let's visualize how many articles belong to each category. A balanced dataset is easier to model — if one class dominates, the model can get high accuracy just by always predicting that class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGGCAYAAABmPbWyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHMUlEQVR4nO3dCbxV4x7/8V/zPM+RBmmeiyalOYQibujS7JZCoohUhIgmpC6pcEWD4qpUlCSVaCBNNBC3ydCkeVj/1/f5/9f+730657ROzukM+/N+vdbZZ6317LXX3mvvvX77eX7Ps9J5nucZAAAA4pU+/tUAAAAgaAIAAAiImiYAAIAACJoAAAACIGgCAAAIgKAJAAAgAIImAACAAAiaAAAAAiBoAgAACICgCbiIfvrpJ0uXLp29+OKLibbNJUuWuG3qNrENHTrUbftiaNKkiZtiPq+ZM2delMfv3LmzlSpVypLLqlWrLHPmzPbzzz9f1MfVa6zjfCH0eul1S+vq1atnAwYMSO7dQApA0AScx5QpU9yJ5ZtvvkkTz8OfsmbNasWLF7fWrVvbSy+9ZIcPH06Ux9m1a5c7Ca9bt85SmpS8b48//rjdcccdVrJkyXOOVVxTcgZ5ye2vv/6yIUOGWJUqVSxHjhxWoEABq1Gjhj3wwAPuOCfUxo0b3XtDP2xieuSRR2zcuHG2Z8+eRNp7pFYZk3sHAFxcTz31lJUuXdpOnTrlTgKq0enbt6+NGjXK/vvf/1q1atVCZQcNGmSPPvpogravE9aTTz7pTug6iQW1cOFCS2rx7dvrr79uZ8+eteSgIO7TTz+15cuXu/nGjRvb22+/HVGme/fudtVVV9k999wTWpYzZ86//djHjh2zjBkv7FSwZcsWS5/+4v/21ntXr9HmzZutU6dOdt9997kgasOGDTZ16lS7+eab3Q+ChAZNem+otjNmMNq2bVvLnTu3vfrqq+7zg+hF0AREmeuuu87q1KkTmh84cKAtXrzYbrjhBrvpppts06ZNli1bNrdOJ9MLPaEGdfToUcuePbtrmkpOmTJlSrbHnjx5sl122WWuGUjKlCnjpnA9e/Z0y/75z3/GuZ3Tp0+7wC8hr6VqHC9UlixZLDl88MEHtnbtWnvnnXfszjvvjFh3/PhxO3nyZKI+ngLDW2+91d566y0XWF2sJmukPDTPAYlAX9KDBw+22rVrW548eVxzQaNGjeyzzz6L8z6jR492TTEKUK655hr7/vvvzymjX9L6ss6fP787uSnYUW1QYmvWrJk98cQTLp/mP//5T7w5TZ988oldffXVljdvXlfTUb58eXvsscfcOtVaXXnlle7/Ll26hJqR1Nwk+hWv5pTVq1e7mgIFS/59Y+Y0+c6cOePKFC1a1L2uCux++eWXQLk14ds8377FltN05MgRe+ihh6xEiRIuQNBzVT6a53kR5bSdPn36uJO5np/KVq5c2ebPnx/o9df9dAwScjIOz48bM2aMXX755e5xVWOSkPdjzJwm/5hv3brVvSY6ztqGXjMFuPG97n6z4pdffmn9+vWzQoUKucdWzc9vv/0WcV8Fd3os1QjpfdC0aVO370HypLZt2+ZuGzZseM46fU5UK5SQz5H2+7bbbnP/az/890Z4nmDLli3d5yMlNu3i4qGmCUgEhw4dsokTJ7qclB49erj8oDfeeMPlCynBN2ZTkH6xqkzv3r3dL+OxY8e6k+b69eutSJEiroyaGnRSuOSSS1wTmU4+06dPt3bt2tn777/vTkSJ6a677nLBiZrJ9Bxio31SjZSa8NRMoZO0Tq46SUrFihXdcp2w1YykE7U0aNAgtI0//vjD1XbdfvvtrtbEf75xeeaZZ9wJTHkl+/btcwFCixYt3M
nLrxELIsi+hVNgpABNgUa3bt3cMVywYIH179/f/ve//7mgN9yyZcts1qxZdu+991quXLlcnlj79u1t586dLt8mLtqWytSqVcsutJZK7yE9Jx0PBQYJfT/G5h//+Idrxh0+fLitWbPGba9w4cL2/PPPn/e+ai7Lly+fyzlScKdjpqBy2rRpETWcI0aMsBtvvNHt17fffutu9VzORz82/M+RmpDjCzaDfI4UwN9///3umOkzoPeK+LeiAFT0Xq9Zs+Z59xFplAcgXpMnT1a1gvf111/HWeb06dPeiRMnIpbt37/fK1KkiNe1a9fQsh07drhtZcuWzfv1119Dy7/66iu3/MEHHwwta968uVe1alXv+PHjoWVnz571GjRo4F1xxRWhZZ999pm7r27/7vPIkyePV7NmzdD8kCFD3H18o0ePdvO//fZbnNvQ9lVGjxfTNddc49ZNmDAh1nWaYj6vSy65xDt06FBo+fTp093ysWPHhpaVLFnS69Sp03m3Gd++6f7aju+DDz5wZZ9++umIcrfeequXLl06b+vWraFlKpc5c+aIZd9++61b/vLLL3vx+fTTT125jz76KN5yOXLkiHiO/nspd+7c3r59+y7o/ejvu45zzGMes9zNN9/sFShQIGJZzNfdf4+1aNHCvVd9el9nyJDBO3DggJvfs2ePlzFjRq9du3YR2xs6dKi7f2zHMtzRo0e98uXLu7Lah86dO3tvvPGGt3fv3nPKBv0czZgx47yfIx3jXr16xbtvSNtongMSQYYMGUJ5JGp2+PPPP11+iZoB9Cs9Jv3K1S9fnxJ869ata/PmzXPzur/yjPRrX7UEv//+u5tUS6Nf4z/++KOroUhsam6Lrxedmmrkww8/vOCkadWGqKknqLvvvtvV3PjUzFKsWLHQa5VUtH0dV9VAhFNznWKNjz/+OGK5ar/UROZTbZyaibZv3x7v4+iYimpmLoRqs9QM9nfej7FRDlU41cxpX1WLdT6q9Qqv/dF91czqD6ewaNEitz+qlYtZQxWEahi/+uorV+vnN6+pNlDvC23jxIkTSfI50jHS/RG9CJqARPLmm2+6E6VyJtQcoxPZ3Llz7eDBg+eUveKKK85ZVq5cuVB3ZzV56cSsPCNtJ3xSk4eoqSqxqQdSeIASU4cOHVxTh3pyqVlNTWxq6khIAKVgMSGJyjFfK52My5YtG2vX8MSkE7zybWK+Hn6TTczxlJTIHdtJdv/+/YEeL2aeVFBqQvu778fYxHw+flAX5Pmc777+a6fjGE5Ni0GDR+VZqXlP7wNNan5Uztkrr7xiw4YNS5LPkbZFEnh0I6cJSARKnlbyqmqQ9OtXuR/6ta98ED9pNSH8IOThhx92v4hjE/OE83f9+uuv7oQa33b1C3/p0qUuz0cnYCU6K09F+VjKhdJzPp+E5CEFFdeJTLUbQfYpMcT1OOcLhvx8p6DBVZDXMzHejxf6fP7ufS+Ecpy6du3q8pPUw1C96p5++ulE/xwdOHDAChYsmGj7jdSHoAlIBBq1Wl/WSgQOP4H7v2ZjUrNATD/88EOo95bf3Vzd4NXsczH44wLFdXIJ737dvHlzN2lsp2effdYNzKhASvua2L/EY75WOvGqBiF8PCnVTuiEFpNqNMK77idk33Qi1thJatYJr21STyx/fWKoUKGCu92xY4cl1/vxYvNfOx3H8JoyNZtdaPDovw/UROr3RE3I5+h87w0146lXYnhyOKIPzXNAIvB/WYf/klbOxYoVK+LsYh6eS6EeTSqvXmWimgF1lf/3v/9tu3fvPuf+Mbtv/13K+1CThk5gHTt2jLOcckRi8nti+Xkk6p0ksQUxF8LvaRgeEOg18V8r0Yly5cqVEePzzJkz55yhCRKyb9dff72rqVJzTzj1mtMJNvzx/w41V2pIg8QccT6h78eLTQG3xv8aP358xPKYr3Vc1NMuttwiBckatkDNdAn9HJ3vvaFhMuLrbYnoQE0TENCkSZNiHXdHl21QN3z9ql
fzQJs2bVytwYQJE6xSpUouTyi2JgGNddSrVy8XbKhLtpppwq9vpcs2qEzVqlVdt3H9at67d6878akpTSeOC6EEZtWWKBFX21PApLGX9OtfY9fEN9ihuuyreU7PUeWVD6JRki+99FK3r34Ao4RxPX/V0OhkpCT3uHJvzkd5Ltq2kse1v3qt9PqFD4ugHCsFU9dee61L+lUTlJqowhOzE7pv6gqvMXtUi6acmerVq7smSCXBawT1mNv+OzTi9OzZsxMtZyah78eLTflw+tyMHDnSDeug46b3s96bav4632ug96tqzXRfDQiqDgxKuNdnVJ+n8HGngn6OFPwr2NSQCmqmVocFNTsr8PIfU7laDDcQ5ZK7+x6Q0vndqOOafvnlF9eF+dlnn3Xdn7NkyeK67c+ZM+ecbux+N/EXXnjBGzlypFeiRAlXvlGjRq6Lekzbtm3z7r77bq9o0aJepkyZXPf7G264wZs5c+YFDzngT+o+re22bNnSdd8P79Yf15ADixYt8tq2besVL17c3V+3d9xxh/fDDz9E3O/DDz/0KlWq5LqVh3fxV/f/ypUrx7p/cQ058O6773oDBw70Chcu7IZqaNOmjffzzz+fc3+9nnp99Ho2bNjQ++abb87ZZnz7FvNYyeHDh113eT1Pvf7qoq5jF96dXrSd3r17n7NPcQ2FENOaNWvcNr744osEDzmg/Ykp6PsxviEHYg4r4b9/9LhxPb+4hrWI7T2qYRGeeOIJ9x7UcW3WrJm3adMmN6xBz5494329tm/f7g0ePNirV6+ee1/oWBYqVMi9NxYvXnxBnyN5/fXXvTJlyrjhEcL398yZM16xYsW8QYMGxbtfSPvS6U9yB24AEO3UZKXeejGvORdN1DSmvCQlcauGL6VQc7ou16IaTA1rgOhFThMApABKqFdPxJhDGaRVulBwTGp6ldgup5Oc1GSnEc0JmEBNEwDgotOAlJqUcK+cJF2G5t1337VWrVq5y9UAKRGJ4ACAi05DRqgHnQao1CjjfnK4muaAlIqaJgAAgADIaQIAAAiAoAkAACAAcpoSia5xtGvXLjdgHhd0BAAg9dDoS7rygIb90KWi4kLQlEgUMOlSCAAAIHXSpZd0hYO4EDQlEv+CnnrBc+fOnVibBQAASUw9OFXxEX5x7tgQNCUSv0lOARNBEwAAqc/50mtIBAcAAAiAoAkAACAAgiYAAIAACJoAAAACIGgCAAAIgKAJAAAgAIImAACAlB40jR8/3qpVqxYa26h+/fr28ccfh9YfP37cevfubQUKFLCcOXNa+/btbe/evRHb2Llzp7Vp08ayZ89uhQsXtv79+9vp06cjyixZssRq1aplWbJksbJly9qUKVPO2Zdx48ZZqVKlLGvWrFa3bl1btWpVEj5zAACQ2iRr0KShyp977jlbvXq1ffPNN9asWTNr27atbdiwwa1/8MEH7aOPPrIZM2bY559/7i5Vcsstt4Tuf+bMGRcwnTx50pYvX25vvvmmC4gGDx4cKrNjxw5XpmnTprZu3Trr27evde/e3RYsWBAqM23aNOvXr58NGTLE1qxZY9WrV7fWrVvbvn37LvIrAgAAUiwvhcmXL583ceJE78CBA16mTJm8GTNmhNZt2rTJ0y6vWLHCzc+bN89Lnz69t2fPnlCZ8ePHe7lz5/ZOnDjh5gcMGOBVrlw54jE6dOjgtW7dOjR/1VVXeb179w7NnzlzxitevLg3fPjwwPt98OBBt2+6BQAAqUfQc3iKyWlSrdF7771nR44ccc10qn06deqUtWjRIlSmQoUKdtlll9mKFSvcvG6rVq1qRYoUCZVRDZGuIePXVqlM+Db8Mv42VEulxwovoysca94vAwAAkOzXnlu/fr0LkpS/pLyl2bNnW6VKlVxTWubMmS1v3rwR5RUg7dmzx/2v2/CAyV/vr4uvjAKrY8eO2f79+13AFluZzZs3x7nfJ06ccJNP20tOv/32W7LvQ3JQLlyhQoUs2kTr8RaOeX
TheEeX3Cn8Oz3Zg6by5cu7AOngwYM2c+ZM69Spk8tfSumGDx9uTz75pKWUE+g/u3S3Pw8ftWiTP1d2+8/kiSn6Q5bYovl4C8c8unC8o0v+FP6dnuxBk2qT1KNNateubV9//bWNHTvWOnTo4JrODhw4EFHbpN5zRYsWdf/rNmYvN793XXiZmD3uNK9oNlu2bJYhQwY3xVbG30ZsBg4c6JLHffrVX6JECUsOemydQAvVb2858kfWmKVlR/7ca7+teN89/5T6AUsK0Xq8hWMeXcec483xTmmSPWiK6ezZs67ZSwFUpkyZbNGiRW6oAdmyZYsbYkDNeaLbZ555xvVy03AD8sknn7iASE18fpl58+ZFPIbK+NtQ0KbH0uO0a9cutA+a79OnT5z7qeELNKUk+jLNXfhSiya/WfSKxuMtHPPoOuYcb453SpKsQZNqa6677jqX3H348GGbOnWqG1NJwwHkyZPHunXr5mpz8ufP7wKh++67zwU79erVc/dv1aqVC47uuusuGzFihMtfGjRokBvbyQ9oevbsaa+88ooNGDDAunbtaosXL7bp06fb3LlzQ/uhx1CzYJ06deyqq66yMWPGuIT0Ll26JNtrAwAAUpZkDZpUQ3T33Xfb7t27XZCkgS4VMLVs2dKtHz16tOvJppom1T6p19urr74aur+a1ebMmWO9evVywVSOHDlc8PPUU0+FypQuXdoFSBrzSc1+Ghtq4sSJbls+NQUqT0TjOynwqlGjhs2fP/+c5HAAABC9kjVoeuONN+Jdr9G5NVK3priULFnynOa3mJo0aWJr166Nt4ya4uJrjgMAANEtxYzTBAAAkJIRNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAKT0oGn48OF25ZVXWq5cuaxw4cLWrl0727JlS0SZJk2aWLp06SKmnj17RpTZuXOntWnTxrJnz+62079/fzt9+nREmSVLllitWrUsS5YsVrZsWZsyZco5+zNu3DgrVaqUZc2a1erWrWurVq1KomcOAABSm2QNmj7//HPr3bu3rVy50j755BM7deqUtWrVyo4cORJRrkePHrZ79+7QNGLEiNC6M2fOuIDp5MmTtnz5cnvzzTddQDR48OBQmR07drgyTZs2tXXr1lnfvn2te/futmDBglCZadOmWb9+/WzIkCG2Zs0aq169urVu3dr27dt3kV4NAACQkmVMzgefP39+xLyCHdUUrV692ho3bhxarhqkokWLxrqNhQsX2saNG+3TTz+1IkWKWI0aNWzYsGH2yCOP2NChQy1z5sw2YcIEK126tI0cOdLdp2LFirZs2TIbPXq0C4xk1KhRLjjr0qWLm9d95s6da5MmTbJHH300CV8FAACQGqSonKaDBw+62/z580csf+edd6xgwYJWpUoVGzhwoB09ejS0bsWKFVa1alUXMPkUCB06dMg2bNgQKtOiRYuIbaqMlotqqRSohZdJnz69m/fLxHTixAn3GOETAABIu5K1pinc2bNnXbNZw4YNXXDku/POO61kyZJWvHhx++6771wNkvKeZs2a5dbv2bMnImASf17r4iujQOfYsWO2f/9+18wXW5nNmzfHmY/15JNPJtKzBwAAKV2KCZqU2/T999+7ZrNw99xzT+h/1SgVK1bMmjdvbtu2bbPLL7/ckotqvJQD5VMAVqJEiWTbHwAAEAVBU58+fWzOnDm2dOlSu/TSS+Mtq15tsnXrVhc0KdcpZi
+3vXv3uls/D0q3/rLwMrlz57Zs2bJZhgwZ3BRbmbhyqdQLTxMAAIgOyZrT5HmeC5hmz55tixcvdsna56Peb6IaJ6lfv76tX78+opebeuIpIKpUqVKozKJFiyK2ozJaLkoWr127dkQZNRdq3i8DAACiW8bkbpKbOnWqffjhh26sJj8HKU+ePK4GSE1wWn/99ddbgQIFXE7Tgw8+6HrWVatWzZXVEAUKju666y43FIG2MWjQILdtvyZI4zq98sorNmDAAOvatasL0KZPn+56x/nU1NapUyerU6eOXXXVVTZmzBg39IHfmw4AAES3ZA2axo8fHxrAMtzkyZOtc+fOrgZIQwn4AYxyhtq3b++CIp+a1dS016tXL1crlCNHDhf8PPXUU6EyqsFSgKSAa+zYsa4JcOLEiaHhBqRDhw7222+/ufGdFHhp6AINiRAzORwAAESnjMndPBcfBUkaAPN81Ltu3rx58ZZRYLZ27dp4y6ipUBMAAECKHqcJAAAgpSJoAgAACICgCQAAIACCJgAAgAAImgAAAAIgaAIAAAiAoAkAACAAgiYAAIAACJoAAAACIGgCAAAIgKAJAAAgAIImAACAAAiaAAAAAiBoAgAACICgCQAAIACCJgAAgAAImgAAAAIgaAIAAAiAoAkAAOBiBE1nzpyxdevW2f79+//upgAAANJO0NS3b1974403QgHTNddcY7Vq1bISJUrYkiVLkmIfAQAAUl/QNHPmTKtevbr7/6OPPrIdO3bY5s2b7cEHH7THH388KfYRAAAg9QVNv//+uxUtWtT9P2/ePLvtttusXLly1rVrV1u/fn1S7CMAAEDqC5qKFCliGzdudE1z8+fPt5YtW7rlR48etQwZMiTFPgIAACS7jAm9Q5cuXewf//iHFStWzNKlS2ctWrRwy7/66iurUKFCUuwjAABA6guahg4dalWqVLFffvnFNc1lyZLFLVct06OPPpoU+wgAAJD6gia59dZb3e3x48dDyzp16pR4ewUAAJDac5qUyzRs2DC75JJLLGfOnLZ9+3a3/IknnggNRQAAAGDRHjQ988wzNmXKFBsxYoRlzpw5tFxNdhMnTkzs/QMAAEidQdNbb71lr732mnXs2DGit5zGbtJ4TQAAAGlRgoOm//3vf1a2bNlzlp89e9ZOnTqVWPsFAACQuoOmSpUq2RdffBHrSOE1a9ZMrP0CAABI3b3nBg8e7HrKqcZJtUuzZs2yLVu2uGa7OXPmJM1eAgAApLaaprZt27przn366aeWI0cOF0Rt2rTJLfNHBwcAAEhrLmicpkaNGtknn3yS+HsDAACQVmqaAAAAolGgmqZ8+fK568wF8eeff/7dfQIAAEidQdOYMWOS5MGHDx/uEsk1vlO2bNmsQYMG9vzzz1v58uVDZXSploceesjee+89O3HihLVu3dpeffVVK1KkSKjMzp07rVevXvbZZ5+5UcqVqK5tZ8z4/5/ekiVLrF+/frZhwwYrUaKEDRo0yDp37hyxP+PGjbMXXnjB9uzZ48adevnll+2qq65KkucOAADSYNCUVNeV+/zzz61379525ZVX2unTp+2xxx6zVq1a2caNG12SuTz44IM2d+5cmzFjhuXJk8f69Oljt9xyi3355Zehy7q0adPGihYtasuXL7fdu3fb3XffbZkyZbJnn33WldmxY4cr07NnT3vnnXds0aJF1r17dytWrJgLwmTatGkuqJowYYLVrVvXBYpap56BhQsXTpLnDwAA0nAi+Lx589xI4H6w4Vu4cKELYK677rrA25o/f37EvC7PogBl9erV1rhxYzt48KC7nt3UqVOtWbNmrszkyZOtYsWKtnLlSqtXr557XAVZ6s2n2qcaNWq4a+M98sgjNnToUHepFwVCpUuXtpEjR7pt6P7Lli2z0aNHh57HqFGjrEePHtalSxc3r/soWJs0aZI9+uijCX2ZAABAtCeCK4BQcBSTxmz6u8GFgiTJnz+/u1XwpFHGW7RoESpToU
IFu+yyy2zFihVuXrdVq1aNaK5TIHTo0CHXFOeXCd+GX8bfxsmTJ91jhZdJnz69m/fLxKSmQj1G+AQAANKuBAdNP/74oxsVPCYFM1u3br3gHVHQ1bdvX2vYsKG7+K8ot0g1RXnz5o0oqwBJ6/wy4QGTv95fF18ZBTrHjh2z33//3QWCsZXxtxGTcqbUXOhPypMCAABpV4KDJgUI27dvP2e5AiY/D+lCKLfp+++/dwnfqcHAgQNdzZg//fLLL8m9SwAAIKWNCK4aoW3btkUETOrhdtNNN13QTii5W5dgUe+3Sy+9NLRcyd1qOjtw4EBE+b1797p1fhnNx1zvr4uvTO7cuV2vvYIFC7o8rdjK+NuIKUuWLO7+4RMAAEi7Ehw0jRgxwtUoqTlOydWalFhdoEABe/HFFxO0Lc/zXMA0e/ZsW7x4sdtWuNq1a7tecOrt5lNvNg0xUL9+fTev2/Xr19u+fftCZTRauYIYvxlRZcK34Zfxt6EmQD1WeBk1F2reLwMAAKJbxgtpnlPXfgUd3377raupqVatmuvtdiFNcuoZ9+GHH1quXLlC+UN6DG1Xt926dXNDASg5XIHQfffd5wIZ9ZwTDVGg4Oiuu+5yAZ22oTGYtG3VBomGGnjllVdswIAB1rVrVxegTZ8+3fWO8+kxNLRCnTp13NhMGnLgyJEjod50AAAgul3Qtec0OriCFU1/x/jx491tkyZNIpZrWAF/4EkNC6CebO3bt48Y3NKnZjU17WlwSwVTqgVT8PPUU0+FyqgGSwGSxnwaO3asawKcOHFixLAJHTp0sN9++81dgFiBl4Yu0JAIMZPDAQBAdAoUNL300kt2zz33WNasWd3/8bn//vsT1Dx3PnpMjdStKS4lS5Z040fFR4HZ2rVr4y2jpkJNAAAAFxQ0qbanY8eOLoDR//HVQCUkaAIAAEhTQZMuQxLb/wAAANEiwb3nlCt09OjRc5ZrkMjwPCIAAICoDpqefPJJ++uvv85ZrkBK6wAAANKiBAdNSt5W7lJMGn7Av2YcAABA1A45kC9fPhcsaSpXrlxE4KTrtqn2SeMhAQAARHXQpMEeVcukwSHVDKeBJ30aUbtUqVKMng0AANKswEGTBow8ffq0q2Fq1qyZlShRImn3DAAAILXmNGXMmNGNvK3rsgEAAESTBCeC67ps5xtZGwAAwKL92nP33nuvPfTQQ/brr79a7dq13bXewunivQAAABbtQdPtt9/ubsMvl6I8J38oAvWkAwAAsGgPmriMCgAAiEYJDppKliyZNHsCAACQloIm38aNG23nzp128uTJiOU33XRTYuwXAABA6g6atm/fbjfffLOtX78+lMsk/gjh5DQBAIC0KMFDDjzwwANWunRp27dvn2XPnt02bNhgS5cutTp16tiSJUuSZi8BAABSW03TihUrbPHixVawYEFLnz69m66++mobPny461HHGE4AACAtSnBNk5rfcuXK5f5X4LRr165QgviWLVsSfw8BAABSY01TlSpV7Ntvv3VNdHXr1rURI0a4C/a+9tprVqZMmaTZSwAAgNQWNA0aNMiOHDni/n/qqafshhtusEaNGlmBAgVs2rRpSbGPAAAAqS9oat26dej/smXL2ubNm+3PP/+0fPnyhXrQAQAApDUXPE5TuPz58yfGZgAAANJOIjgAAEA0ImgCAAAIgKAJAAAgsYKmWrVq2f79+0M95o4ePRrkbgAAANEVNG3atCk0zMCTTz5pf/31V1LvFwAAQOrrPVejRg3r0qWLu1yKLtD74osvWs6cOWMtO3jw4MTeRwAAgNQRNE2ZMsWGDBlic+bMcWMxffzxx5Yx47l31TqCJgAAELVBU/ny5e29995z/+sCvYsWLbLChQsn9b4BAACk3sEtz549mzR7AgAAkNZGBN+2bZuNGTPGJYhLpUqV7IEHHrDLL788sfcPAAAgdY7TtGDBAhckrVq1yqpVq+amr776yipXrmyffPJJ0uwlAABAaqtpevTRR+
3BBx+055577pzljzzyiLVs2TIx9w8AACB11jSpSa5bt27nLO/atatt3LgxsfYLAAAgdQdNhQoVsnXr1p2zXMvoUQcAANKqBAdNPXr0sHvuuceef/55++KLL9ykprp//etfbl1CLF261G688UYrXry4G+Ppgw8+iFjfuXNntzx8uvbaayPK/Pnnn9axY0fLnTu35c2b19WCxRyx/LvvvrNGjRpZ1qxZrUSJEjZixIhz9mXGjBlWoUIFV6Zq1ao2b968BD0XAACQtiU4p+mJJ56wXLly2ciRI23gwIFumYKeoUOH2v3335+gbenSLNWrV3dNe7fcckusZRQkTZ48OTSfJUuWiPUKmHbv3u2S0E+dOuVGLldQN3XqVLf+0KFD1qpVK2vRooVNmDDB1q9f7x5PAZbKyfLly+2OO+6w4cOH2w033ODu265dO1uzZo1VqVIloS8RAABIgxIcNKm2R4ngmg4fPuyWKYi6ENddd52b4qMgqWjRonHmV82fP9++/vprq1Onjlv28ssv2/XXX+8u9aJg7p133rGTJ0/apEmTLHPmzK6Xn5oSR40aFQqaxo4d64Kz/v37u/lhw4a5IOyVV15xgRYAAECCm+fCKVi60IApqCVLlrhcKY1K3qtXL/vjjz9C61asWOFqjPyASVSjpFHLNQyCX6Zx48YuYPK1bt3atmzZYvv37w+V0f3CqYyWAwAAXPDglheLan/UbFe6dGk3oOZjjz3maqYUzGTIkMH27NlzTvK5romXP39+t050q/uHK1KkSGhdvnz53K2/LLyMv43YnDhxwk0+NQMCAIC0K0UHTbfffnvofyVnayBNjTqu2qfmzZsn674p/+nJJ59M1n0AAACppHnuYitTpowVLFjQtm7d6uaV67Rv376IMqdPn3Y96vw8KN3u3bs3oow/f74yceVSiZLgDx48GJp++eWXRHqWAAAg1QdN6p2mGp4ff/zRksOvv/7qcpqKFSvm5uvXr28HDhyw1atXh8osXrzYXVS4bt26oTIa2kD77lOSt3Kk1DTnl1m0aFHEY6mMlseXoK5hDsInAACQdiUoaMqUKZMb8yixaDwl9WTzB8vcsWOH+3/nzp1unXqzrVy50n766ScX1LRt29bKli3rkrSlYsWKLu9J40PpWnhffvml9enTxzXrqeec3HnnnS4JXOM3bdiwwaZNm+Z6y/Xr1y+0H7rYsHrhaRiFzZs3u+ETvvnmG7ctAACAC2qe++c//2lvvPFGorx6Ckxq1qzpJlEgo/8HDx7sEr0VoN10001Wrlw5F/TUrl3bDaYZPlaThhTQoJSqAdNQA1dffbW99tprofV58uSxhQsXuoBM93/ooYfc9v3hBqRBgwZubCbdT+NGzZw50w20yRhNAADgghPBlTOkMY8+/fRTF4TkyJEjYr3GPwqqSZMm5nlenOsXLFhw3m2op5w/kGVclECuYCs+t912m5sAAAASJWj6/vvvrVatWu7/H3744ZyBLwEAANKiBAdNn332WdLsCQAAQFocckDd/tV8duzYMTcfXzMbAABA1AVN6vKvpGslZyvxWhfLFSVqK8kaAAAgLUpw0KQL9WroAQ0LkD179tDyDh06uG77AAAAaVGCc5rUfV/NcpdeemnE8iuuuMJ+/vnnxNw3AACA1FvTdOTIkYgaJp8uXRI+fhIAAEBUB02NGjWyt956K2KYAV22ZMSIEda0adPE3j8AAIDU2Tyn4EiJ4BrN++TJkzZgwAB3eRLVNOkyJgAAAGlRgmuadGkRDWqpy5XoWnBqrrvlllts7dq1dvnllyfNXgIAAKS2mib/em6PP/544u8NAABAWgqa9u/f7y7au2nTJjdfqVIl69Kli7sOHAAAQFqU4Oa5pUuXWqlSpeyll15ywZMm/V+6dGm3DgAAIC1KcE1T79693UCW48ePtwwZMrhlZ86csXvvvdetW79+fVLsJwAAQOqqadI153S5FD9gEv3fr18/tw4AACAtSnDQVKtWrV
AuUzgtq169emLtFwAAQOprnvvuu+9C/99///32wAMPuFqlevXquWUrV660cePG2XPPPZd0ewoAAJDSg6YaNWq4kb89zwst06CWMd15550u3wkAACAqg6YdO3Yk/Z4AAACk9qCpZMmSSb8nAAAAaW1wy127dtmyZcts37597mK94ZTzBAAAYNEeNE2ZMsX+9a9/WebMma1AgQIu18mn/wmaAABAWpTgoOmJJ56wwYMH28CBAy19+gSPWAAAAJAqJTjqOXr0qN1+++0ETAAAIKokOGjq1q2bzZgxI2n2BgAAIK00zw0fPtxuuOEGmz9/vlWtWtUyZcoUsX7UqFGJuX8AAACpN2hasGCBlS9f3s3HTAQHAABIixIcNI0cOdImTZpknTt3Tpo9AgAASAs5TVmyZLGGDRsmzd4AAACklaBJF+t9+eWXk2ZvAAAA0krz3KpVq2zx4sU2Z84cq1y58jmJ4LNmzUrM/QMAAEidQVPevHntlltuSZq9AQAASCtB0+TJk5NmTwAAAFIwroMCAACQFDVNpUuXjnc8pu3btyd0kwAAAGkvaOrbt2/E/KlTp2zt2rVuhPD+/fsn5r4BAACk3qBJQw7EZty4cfbNN98kxj4BAACk3Zym6667zt5///3E2hwAAEDaDJpmzpxp+fPnT9B9li5dajfeeKMVL17c5Ul98MEHEes9z7PBgwdbsWLFLFu2bNaiRQv78ccfI8r8+eef1rFjR8udO7cbDqFbt272119/RZT57rvvrFGjRpY1a1YrUaKEjRgx4px9mTFjhlWoUMGV0YWI582bl6DnAgAA0rYEB001a9a0WrVqhSbNK6h57LHH3JQQR44cserVq7umvdgouHnppZdswoQJ9tVXX1mOHDmsdevWdvz48VAZBUwbNmywTz75xA24qUDsnnvuCa0/dOiQtWrVykqWLGmrV6+2F154wYYOHWqvvfZaqMzy5cvtjjvucAGX8rPatWvnpu+//z6hLw8AAEijEpzTpGAiXPr06a1QoULWpEkTV1OT0CY9TbFRLdOYMWNs0KBB1rZtW7fsrbfesiJFirgaqdtvv902bdrkEtC//vprq1OnjiujS7xcf/319uKLL7oarHfeecdOnjzpLjKcOXNmN4r5unXrbNSoUaHgauzYsXbttdeGEtmHDRvmgrBXXnnFBWwAAAAJDpqGDBlyUV61HTt22J49e1yTnC9PnjxWt25dW7FihQuadKsmOT9gEpVXIKeaqZtvvtmVady4sQuYfKqtev75523//v2WL18+V6Zfv34Rj68yMZsLw504ccJN4TVaAAAg7Uqxg1sqYBLVLIXTvL9Ot4ULF45YnzFjRpdbFV4mtm2EP0ZcZfz1sRk+fLgL4vxJuVIAACDtChw0qfYmQ4YM8U4KWKLFwIED7eDBg6Hpl19+Se5dAgAASShwlDN79uw416l5SwnbZ8+eTaz9sqJFi7rbvXv3ukRzn+Zr1KgRKrNv376I+50+fdr1qPPvr1vdJ5w/f74y/vrYZMmSxU0AACA6BK5pUjJ2zEmJ31OmTHFJ17fddptt2bIl0XZMl2tR0LJo0aKIvCHlKtWvX9/N6/bAgQOuV5xv8eLFLnhT7pNfRj3qNHK5T0ne5cuXd/lMfpnwx/HL+I8DAABwQTlNu3btsh49erjxjFSzo95ob775puvWnxAaT0n31eQnf+v/nTt3unGbdMmWp59+2v773//a+vXr7e6773Y94vwefBUrVnS93rQvq1atsi+//NL69OnjksRVTu68806XBK7hBDQ0wbRp01xvufDEb41yrl54I0eOtM2bN7shCTS6ubYFAAAgCUpCUu7Os88+67r1q4lMtTMaNPJCKTBp2rRpaN4PZDp16uRqsAYMGODGctLQAKpRuvrqq11wowEofRpSQMFN8+bNXd5V+/btXVOhT0naCxcutN69e1vt2rWtYMGCbsDM8LGcGjRoYFOnTnXDG2isqSuuuML1nKtSpQrvEgAAkL
CgSQNNqpu+mszefffd0NhJf4fGdtJ4THFRbdNTTz3lpriop5wCnvhUq1bNvvjii3jLqHlREwAAwN8Kmh599FF3KZOyZcu6pjhNsZk1a1bQTQIAAKS9oEn5RKr5AQAAiEaBgyblGAEAAESrFDsiOAAAQEpC0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAAETQAAAImDmiYAAIAACJoAAAACIGgCAAAIgKAJAAAgAIImAACA1B40DR061NKlSxcxVahQIbT++PHj1rt3bytQoIDlzJnT2rdvb3v37o3Yxs6dO61NmzaWPXt2K1y4sPXv399Onz4dUWbJkiVWq1Yty5Ili5UtW9amTJly0Z4jAABIHVJ00CSVK1e23bt3h6Zly5aF1j344IP20Ucf2YwZM+zzzz+3Xbt22S233BJaf+bMGRcwnTx50pYvX25vvvmmC4gGDx4cKrNjxw5XpmnTprZu3Trr27evde/e3RYsWHDRnysAAEi5MloKlzFjRitatOg5yw8ePGhvvPGGTZ061Zo1a+aWTZ482SpWrGgrV660evXq2cKFC23jxo326aefWpEiRaxGjRo2bNgwe+SRR1wtVubMmW3ChAlWunRpGzlypNuG7q/AbPTo0da6deuL/nwBAEDKlOJrmn788UcrXry4lSlTxjp27Oia22T16tV26tQpa9GiRaismu4uu+wyW7FihZvXbdWqVV3A5FMgdOjQIduwYUOoTPg2/DL+NuJy4sQJt53wCQAApF0pOmiqW7eua06bP3++jR8/3jWlNWrUyA4fPmx79uxxNUV58+aNuI8CJK0T3YYHTP56f118ZRQEHTt2LM59Gz58uOXJkyc0lShRItGeNwAASHlSdPPcddddF/q/WrVqLogqWbKkTZ8+3bJly5as+zZw4EDr169faF5BFoETAABpV4quaYpJtUrlypWzrVu3ujwnJXgfOHAgoox6z/k5ULqN2ZvOnz9fmdy5c8cbmKmnncqETwAAIO1KVUHTX3/9Zdu2bbNixYpZ7dq1LVOmTLZo0aLQ+i1btricp/r167t53a5fv9727dsXKvPJJ5+4AKdSpUqhMuHb8Mv42wAAAEjxQdPDDz/shhL46aef3JABN998s2XIkMHuuOMOl0fUrVs310T22WefucTwLl26uGBHPeekVatWLji666677Ntvv3XDCAwaNMiN7aSaIunZs6dt377dBgwYYJs3b7ZXX33VNf9pOAMAAIBUkdP066+/ugDpjz/+sEKFCtnVV1/thhPQ/6JhAdKnT+8GtVRvNvV6U9DjU4A1Z84c69WrlwumcuTIYZ06dbKnnnoqVEbDDcydO9cFSWPHjrVLL73UJk6cyHADAAAg9QRN7733Xrzrs2bNauPGjXNTXJQ4Pm/evHi306RJE1u7du0F7ycAAEj7UnTzHAAAQEpB0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0AQAABAAQRMAAEAABE0AAAABEDQBAAAEQNAEAAAQAEETAABAAARNAAAAARA0AQAABEDQBAAAEABBEwAAQAAETQAAAAEQNAEAAARA0BTDuHHjrFSpUpY1a1arW7eurVq1KsjrCAAA0jiCpjDTpk2zfv362ZAhQ2zNmjVWvXp1a926te3bty
/5jhAAAEgRCJrCjBo1ynr06GFdunSxSpUq2YQJEyx79uw2adKk5DtCAAAgRSBo+n9Onjxpq1evthYtWvz/Fyd9eje/YsWK5Do+AAAghciY3DuQUvz+++925swZK1KkSMRyzW/evPmc8idOnHCT7+DBg+720KFDdrEdPnzYzpw+bQd2/2Snjh+1aHFk/z73vPX8k+N1Ty7ReryFYx5dx5zjzfG+WPxziOd58ZYjaLpAw4cPtyeffPKc5SVKlLBks3yJRaOaNWtaVIrS4y0c8+jC8Y4uNZPxO10/SvPkyRPneoKm/6dgwYKWIUMG27t3b8QLpPmiRYue88INHDjQJY37zp49a3/++acVKFDA0qVLZ9FC0bkCxV9++cVy586d3LuDJMbxjj4c8+gSrcfb8zwXMBUvXjzecgRN/0/mzJmtdu3atmjRImvXrl0oENJ8nz59znnhsmTJ4qZwefPmtWilD1c0fcCiHcc7+nDMo0s0Hu888dQw+QiawqjmqFOnTlanTh276qqrbMyYMXbkyBHXmw4AAEQ3gqYwHTp0sN9++80GDx5se/bssRo1atj8+fPPSQ4HAADRh6ApBjXFxdYch9ipiVKDgcZsqkTaxPGOPhzz6MLxjl8673z96wAAAMDglgAAAEEwIjgAAEAABE1IckOHDnVJ9fHp3LlzaKgHAMlnyZIlbqy5AwcOcBiAGAiaopQuRpwrVy47ffp0aNlff/1lmTJlsiZNmsT6Jbpt27Zk2FP8XeoR2qtXL7vssstckqcGa23durV9+eWXSf7ilipVyg3dgcSjHxj6PPqTBtS99tpr7bvvvkuU7Tdo0MB2794daMwapB5TpkxJtrEEO6ehH8UETVGqadOmLkj65ptvQsu++OILd0L96quv7Pjx46Hln332mTvhXn755Ql6DPUxCA/KkDzat29va9eutTfffNN++OEH++9//+sC4z/++CNJL4CNpKMgSYGNJg3AmzFjRrvhhhsSbaBffQ9E05UNouEHkIbU0ec/Jn0vXHrppRGBeGyTgi4QNEWt8uXLW7FixVwtkk//t23b1kqXLm0rV66MWK4gSxcovv/++61w4cKWNWtWu/rqq+3rr7+OKKcP18cff+xGV9eHetmyZec8ti6MrIFE9atHv5IHDBhw3osk4sKoiUXB8PPPP++OYcmSJd3ArboM0E033eTK6JiNHz/errvuOsuWLZuVKVPGZs6cGbGd9evXW7Nmzdx6HbN77rnHBd0xf0k+88wz7jIEen8pMPv555/twQcfDH3xipbdeOONli9fPsuRI4dVrlzZ5s2bxyFOAP+EqUlN348++qi77IVOqrE1r61bt84t++mnn857DGLe36+hWLBggVWsWNFy5swZCtrCTZw40a3Xd0OFChXs1VdfjQiiNZSLvnO0Xu9DXb9T9NlXE74fCOj9o+8ZJO4PIH129d0d04cffmj33XdfKAjX9NBDD7n3RPgyBV0gaIpqOomqFsmn//UBvOaaa0LLjx075mqeVFbBzfvvv+8+sGvWrLGyZcu6Xzm65l44fYE/99xztmnTJqtWrdo5jzty5Ej3RTxp0iQXVOn+s2fPvgjPOProBKfpgw8+cEFvXJ544gn3hfztt99ax44d7fbbb3fHTzQqvo6zTrAKkmfMmGGffvrpOeOZqcZjy5Yt9sknn9icOXNs1qxZ7hfsU089Ffrild69e7t9Wbp0qQvGFNBpH3FhFLz+5z//cZ9HBbRBJPQYHD161F588UV7++233X127txpDz/8cGj9O++84wYFVtCs982zzz7r3lP6rpCXXnrJneCnT5/u3iMqr6Zb0XfK6NGj7d///rf9+OOP7r1atWpV3g4X8ANIZf71r3+5AZkVnFapUsV9FuNqnlOLwsKFC92PZT8I16T3gmov/XkFW2pmL126tAu+qlevfs4Pqw0bNrjaTl16RakfjRo1OielQ+8hBc56n+o9eOrUqdR3nDVOE6LT66+/7uXIkcM7de
qUd+jQIS9jxozevn37vKlTp3qNGzd2ZRYtWqQqIO+nn37yMmXK5L3zzjuh+588edIrXry4N2LECDf/2WefubIffPBBxOMMGTLEq169emi+WLFiofuIHv/SSy/12rZtexGedfSZOXOmly9fPi9r1qxegwYNvIEDB3rffvttaL2OWc+ePSPuU7duXa9Xr17u/9dee83d/6+//gqtnzt3rpc+fXpvz549br5Tp05ekSJFvBMnTkRsp2TJkt7o0aMjllWtWtUbOnRokjzXaKDXOkOGDO6zq0nHT5+p1atXR3wO9+/fH7rP2rVr3bIdO3ac9xjEvP/kyZPd/NatW0Nlxo0b54637/LLL3ffG+GGDRvm1a9f3/1/3333ec2aNfPOnj17zuONHDnSK1eunPs+Qdz0PZkzZ06vb9++3vHjx89Zf+bMGa9evXpe5cqVvYULF3rbtm3zPvroI2/evHmh45gnT56I+8yZM8e99jHF/M5++umnvQoVKnjz589329W2smTJ4i1ZssSt//XXX738+fN7t9xyi/f11197W7Zs8SZNmuRt3rw59J7NnTu3+57ZtGmT26/s2bO775bUhpymKKZaJdUiqPZAv2DKlStnhQoVcjVNfl6TqurVXHPw4EH3q6Bhw4ah+ytpXL90/BoJn67dFxdtRzUOdevWDS3TL5r47oO/RzVIu3btcr/01ayiY1qrVq2IHIX69etH3Efz/nHVrX5ZqhnHp/eBLmitWgOfageUD3M+anp5+umn3TY0mnxiJTBHE9U0qMlN06pVq1xNoJpX1ewWREKPQfbs2SNyGlVbsG/fPve/vkNUo9CtW7dQzaYmbd+vaVDzrfZVzbZ6bNVu+G677TZXo63vmR49erhaZ3Ihz6XvSX1mVXunGiMdu8ceeyx07FT7q/eCanhbtmzpXk/V/Oh9ERc1zfm1VHFRjaRqDtUy0Lp1a7ddHc9//vOfrnZQxo0b5zoOvPfee+67XOcSXbNVx9unmupXXnnFNd1qv9q0aeNqp1MbgqYopup8NZ+oKU6TgiVRTkGJEiVs+fLlbrlyWRIi/OSKlEFV9foiVZOJjqu+9HSyTExBj3v37t1t+/btdtddd7mmIX3Jvvzyy4m6L2mdXmt9fjVdeeWVLp9Iwcvrr79u6dP/36/18DzBmM0gCT0G+oEUTjlP/vb93DY9th/Iafr+++9DuZEK0nfs2GHDhg1zAdI//vEPu/XWW906fdco+FYOlJp+7r33XmvcuHHqbLpJxh9Aes31fa6AJQgdv48++ui8QdPWrVtd86y+P3KGBcVvvfVWKCjWY6s5Lub7JJxypDJkyBBr4J2aEDRFOf1i1QdPU/hQA/rSUkK3frmojH5lqhYhvJeGvtRUS1WpUqXAj6dfI/qwqCbLp1+Vq1evTsRnhfPRMdNJ1hee+O/PK6lXdKtcp/Dyeh/o5Bz+SzI2es8o8T8mnSh79uzpfhUr6VQnXFw4BTE6HgpIVFss4YnaOqkl1TFQ/ox+aCkI8wM5f1IOjE+5Lkom1uNMmzbN5TL5+ZAKlpSYrtwnfRetWLHCBXMI/gNIr2FC6Ltd370aYiI+flA8d+7ciKB448aNobymII8dW+Ct2urUhgv2RjkFRH5Cnl/TJPpfib7q9aIy+mWrrq79+/e3/Pnzu54uI0aMcL9AVC2fEA888IBLFL/iiitcVe2oUaMYSC+JqFeNmj+6du3qkvKVoKlhJnTslPzpU3K3ahvUI1JJuvpCfeONN9w6JYbrS7lTp06ul5N6aKm3jWopdMKMj5J9lTisxHL1jCpYsKD17dvXNRnoF/H+/ftdbaYfoCEYNZns2bPH/a/XUM0eOrkp8FCwooBIx0qJ2eplpc4X4RL7GDz55JOu2U0/ilQDov3T+0zbVk9Zfcb1Y6lmzZouuNP7TQnGamZSLYkCazXZqxlQSe06CSvRGcF+ACl5Xp/vX3/91R3vILVNappTE1l47U9c29dnV8n/14SdI8LpsdVsqP
NIfLVNaUJyJ1UheSkxVG8DJfmFU+K3lpcvXz607NixYy6hs2DBgi4JsGHDht6qVatC62NLQI0tqVAJjQ888IBLDMybN6/Xr18/7+677yYRPAkoYfTRRx/1atWq5ZJAlXypYzpo0CDv6NGjroyOmRJ7W7Zs6Y5rqVKlvGnTpkVs57vvvvOaNm3qksmV8NmjRw/v8OHDofVK9IwtkX/FihVetWrV3Hb9r5s+ffq4xGEtK1SokHfXXXd5v//+e1I8/TRJr7VeS3/KlSuXd+WVV7qEf9+yZctcsreOV6NGjbwZM2ZEJILHdwxiSwSPmUA8e/bs0PH0qZNIjRo1vMyZM7uOA+pMMmvWLLdOCb9ap8R1fe6bN2/urVmzJrQtdTzQcq1XMvOnn36axK9i6qPjo8/g22+/7TpybN++3Zs+fbpLyO/atasr06RJE69KlSouEVzrlQT+8ccfx3oclTD+/vvvx/pYMb+zH3/8ca9AgQLelClTXIcAdTp46aWX3Ly/b1rvJ4L/8MMP3ltvvRWRCB7z+0HngGuuucZLbQiagCink59OXABS9w+gP/74w+vSpYsLYBQwK4BSD7mYQZMCHwXM4T1i4wua1OtxzJgx7vHUi1qBduvWrb3PP/88VEaBXKtWrdx+KZBXsK6edmktaEqnP8ld2wUg+Si3QD2W0splDgDET82l6m3HoLIJRyI4AABRRL3sNCgmEo6aJgAAgACoaQIAAAiAoAkAACAAgiYAAIAACJoAAAACIGgCAAAIgKAJAAAgAIImAKmOrrum69+VKVPGXRdL11rTddcWLVoU6P663pmuewYACcEFewGkKj/99JM1bNjQBT0vvPCCVa1a1V0odMGCBe7i05s3b7bUJioudAqkAdQ0AUhV7r33Xnfpl1WrVln79u3dFd0rV65s/fr1s5UrV4YuE6FgKkeOHK4WSvf566+/3LolS5ZYly5d7ODBg247moYOHerWnThxwh5++GG75JJL3H3r1q3ryod7/fXX3TazZ89uN998s3usmLVW48ePt8svv9wyZ85s5cuXt7fffjtivR5TZW666Sb3OE8//bSVLVvWXnzxxYhy69atc2W3bt2aJK8lgARK7ovfAUBQuiBpunTpvGeffTbecqNHj/YWL17s7dixw1u0aJG70GivXr3cuhMnTriLj+bOndvbvXu3mw4fPuzWde/e3WvQoIG3dOlSd1HTF154wV3YVFdtl2XLlnnp06d3y7ds2eKNGzfOy58/f8TV42fNmuUuaqp1KjNy5EgvQ4YMbn98+uotXLiwN2nSJHdR059//tl75plnvEqVKkU8j/vvv99r3LgxbxAghSBoApBqfPXVVy7gUGCSEDNmzHBXfveFX/Hdp8BFwc3//ve/iOXNmzf3Bg4c6P7v0KGD16ZNm4j1HTt2jNiWgq4ePXpElLntttu866+/PjSv59C3b9+IMnpcPb6eo5w8edIrWLCgN2XKlAQ9VwBJh+Y5AKnG/403zk9XcG/evLlrZsuVK5fddddd9scff9jRo0fjvM/69evtzJkzrrkvZ86coenzzz+3bdu2uTJbtmyxq666KuJ+Mec3bdrkcq7CaV7Lw9WpUydivnjx4tamTRubNGmSm//oo49cc+Ftt90W6DkDSHokggNINa644gqX4xNfsrcSxW+44Qbr1auXPfPMM5Y/f35btmyZdevWzU6ePOlykWKjnKcMGTLY6tWr3W04BU+JTblMMXXv3t0FeKNHj7bJkydbhw4d4txfABcfNU0AUg0FQK1bt7Zx48bZkSNHzll/4MABF/ScPXvWRo4cafXq1XM1R7t27YoopwRt1SqFq1mzplu2b98+l5QdPhUtWtSVUVL3119/HXG/mPMVK1a0L7/8MmKZ5itVqnTe53f99de7YEpJ4vPnz7euXbsGeFUAXCwETQBSFQVMCm7ULPb+++/bjz/+6Jq+XnrpJatfv74LctSF/+WXX7bt27e7nmsTJkyI2EapUqVczZ
LGdfr9999ds52Cq44dO9rdd99ts2bNsh07drgeesOHD7e5c+e6+2lsqHnz5rkec3rcf//73/bxxx+72i9f//793ThQCnxURmW1PfXKOx/VcHXu3NkGDhzoatX0fACkIEmYLwUASWLXrl1e7969vZIlS3qZM2f2LrnkEu+mm27yPvvsM7d+1KhRXrFixbxs2bJ5rVu39t566y2XfL1///7QNnr27OmSw7V8yJAhoeTrwYMHe6VKlXI94LSNm2++2fvuu+9C93vttdfc42nb7dq1855++mmvaNGiEfv36quvemXKlHHbKFeunHv8cHrM2bNnx/rc1JtO60eMGJGorxmAvy+d/iR34AYAqVWPHj1cjtUXX3yRKNvTdpTE/ssvv1iRIkUSZZsAEgeJ4ACQABqAsmXLli73SE1zb775pr366qt/+zVUT7nffvvNDbSpHnMETEDKQ04TACSA8pwUNGnEceVKKZdKvd7+rnfffddKlizpktlHjBjBMQFSIJrnAAAAAqCmCQAAIACCJgAAgAAImgAAAAIgaAIAAAiAoAkAACAAgiYAAIAACJoAAAACIGgCAAAIgKAJAADAzu//AL1KAUGFs+OSAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Exact counts:\n", + " World: 30,000\n", + " Sports: 30,000\n", + " Business: 30,000\n", + " Sci/Tech: 30,000\n" + ] + } + ], + "source": [ + "# Map numeric labels to readable names for the plot\n", + "label_names = {0: \"World\", 1: \"Sports\", 2: \"Business\", 3: \"Sci/Tech\"}\n", + "\n", + "label_counts = train_df[\"label\"].value_counts().sort_index()\n", + "label_counts.index = [label_names.get(i, i) for i in label_counts.index]\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "label_counts.plot(kind=\"bar\", edgecolor=\"black\", alpha=0.7)\n", + "plt.xlabel(\"Category\")\n", + "plt.ylabel(\"Number of articles\")\n", + "plt.title(\"Label Distribution (Training Set)\")\n", + "plt.xticks(rotation=0)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"\\nExact counts:\")\n", + "for name, count in label_counts.items():\n", + " print(f\" {name}: {count:,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4d. Sample Texts by Category\n", + "\n", + "Numbers and charts are useful, but nothing beats reading actual examples. Let's look at one random article from each category to build intuition about what the model will be working with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- World (label=0) ---\n", + "Explosion Rocks Baghdad Neighborhood BAGHDAD, Iraq, August 24 -- A car bomb exploded near the gate of a US-funded Iraqi television network in Baghdad on Tuesday, killing at least two people and woundi...\n", + "\n", + "--- Sports (label=1) ---\n", + "Second Andre win in a row boosts US gold medal hopes Andre Dirrell, fighting with a tattoo of his grandfather #39;s face on his back, assured the United States of at least two boxing medals Wednesday ...\n", + "\n", + "--- Business (label=2) ---\n", + "US house sales fall in July Sales of non-new houses in the US fell last month but still exceeded analyst forecasts....\n", + "\n", + "--- Sci/Tech (label=3) ---\n", + "Gartner optimistic about chip numbers But that optimism isn #39;t matched by Infineon, which said that while the market worldwide remains buoyant, the US is a special case and cautioned that growth mi...\n", + "\n" + ] + } + ], + "source": [ + "label_names = {0: \"World\", 1: \"Sports\", 2: \"Business\", 3: \"Sci/Tech\"}\n", + "\n", + "for label_id, name in label_names.items():\n", + " sample = train_df[train_df[\"label\"] == label_id].sample(1, random_state=42).iloc[0]\n", + " print(f\"--- {name} (label={label_id}) ---\")\n", + " # Show the first 200 characters to keep output manageable\n", + " print(f\"{sample['text'][:200]}...\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 5. Save Shared Objects for Notebook 2\n", + "\n", + "We'll save the DataFrames so the next notebook can pick up right where we left off without re-downloading the dataset.\n", + "\n", + "> **Why not just re-run the download?** You could! But saving intermediate results is a good habit — it saves time on large datasets and makes each notebook self-contained." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved DataFrames to /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template/data/interim/\n", + " train.parquet (120,000 rows)\n", + " test.parquet (7,600 rows)\n" + ] + } + ], + "source": [ + "# Save DataFrames as parquet files (a compact, fast binary format)\n", + "data_dir = PROJECT_ROOT / \"data\" / \"interim\"\n", + "data_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "train_df.to_parquet(data_dir / \"train.parquet\", index=False)\n", + "test_df.to_parquet(data_dir / \"test.parquet\", index=False)\n", + "if valid_df is not None:\n", + " valid_df.to_parquet(data_dir / \"valid.parquet\", index=False)\n", + "\n", + "print(f\"Saved DataFrames to {data_dir}/\")\n", + "print(f\" train.parquet ({train_df.shape[0]:,} rows)\")\n", + "print(f\" test.parquet ({test_df.shape[0]:,} rows)\")\n", + "if valid_df is not None:\n", + " print(f\" valid.parquet ({valid_df.shape[0]:,} rows)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "Here's what we accomplished:\n", + "\n", + "- **Set up** a Python virtual environment and installed all dependencies\n", + "- **Loaded** the AG News dataset (120k training articles across 4 categories)\n", + "- **Explored** the data: checked text lengths, verified the labels are balanced, and read sample articles\n", + "- **Saved** the data for the next notebook\n", + "\n", + "**Key takeaways from our EDA:**\n", + "- The dataset is well-balanced (~30k examples per class) — no special handling needed\n", + "- Articles are relatively short (typically 30–50 words) — good for a bag-of-words model\n", + "- The four categories are distinct enough that even a simple model should do reasonably well\n", + "\n", + "**Next up:** [02_train_and_evaluate.ipynb](./02_train_and_evaluate.ipynb) — we'll 
build a TF-IDF + Logistic Regression classifier and track the experiment with MLflow." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/setup_template/notebooks/02_train_and_evaluate.ipynb b/tutorials/setup_template/notebooks/02_train_and_evaluate.ipynb new file mode 100644 index 0000000..ade871e --- /dev/null +++ b/tutorials/setup_template/notebooks/02_train_and_evaluate.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook 2: Train & Evaluate a Text Classifier\n", + "\n", + "In the previous notebook, we explored the AG News dataset and saved it for reuse. Now it's time to build a model.\n", + "\n", + "**What you'll do in this notebook:**\n", + "\n", + "1. Load the data we saved in notebook 1\n", + "2. Create a train/validation split\n", + "3. Build a TF-IDF + Logistic Regression pipeline\n", + "4. Train the model and evaluate its performance\n", + "5. 
Track everything with MLflow\n", + "\n", + "**What you'll learn:**\n", + "- How TF-IDF converts raw text into numbers a model can understand\n", + "- How Logistic Regression works as a simple but effective classifier\n", + "- Why we split data into train/validation/test sets\n", + "- How to read a confusion matrix and classification report\n", + "- How MLflow helps you organize and compare experiments\n", + "\n", + "---\n", + "\n", + "### Notebook series\n", + "\n", + "| Notebook | Focus |\n", + "|---|---|\n", + "| 01 — Setup & EDA | Environment, data loading, exploration |\n", + "| **02 — Train & Evaluate** (you are here) | Build, train, and evaluate a text classifier |\n", + "| 03 — Serve & Predict | Load the trained model and make predictions |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Setup\n", + "\n", + "First, let's import our libraries and reload the data from notebook 1." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template\n", + "Random seed set to 42.\n" + ] + } + ], + "source": [ + "import os, sys, yaml\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import (\n", + " accuracy_score, f1_score,\n", + " classification_report, ConfusionMatrixDisplay\n", + ")\n", + "\n", + "import mlflow\n", + "import mlflow.sklearn\n", + "\n", + "# Locate the project root\n", + "PROJECT_ROOT = (\n", + " Path(\"__vsc_ipynb_file__\").resolve().parent.parent\n", + " if 
\"__vsc_ipynb_file__\" in dir()\n", + " else Path.cwd().parent\n", + ")\n", + "if not (PROJECT_ROOT / \"requirements.txt\").exists():\n", + " PROJECT_ROOT = Path.cwd()\n", + "\n", + "if str(PROJECT_ROOT) not in sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from src.utils import set_all_seeds, get_env\n", + "\n", + "set_all_seeds(42)\n", + "print(f\"Project root: {PROJECT_ROOT}\")\n", + "print(\"Random seed set to 42.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load saved data and config" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: 120,000 rows\n", + "Test: 7,600 rows\n", + "Valid: None (will create below)\n" + ] + } + ], + "source": [ + "# Load config\n", + "cfg_path = PROJECT_ROOT / \"configs\" / \"baseline.yaml\"\n", + "with open(cfg_path) as f:\n", + " cfg = yaml.safe_load(f)\n", + "\n", + "# Load DataFrames saved by notebook 1\n", + "data_dir = PROJECT_ROOT / \"data\" / \"interim\"\n", + "train_df = pd.read_parquet(data_dir / \"train.parquet\")\n", + "test_df = pd.read_parquet(data_dir / \"test.parquet\")\n", + "valid_df = pd.read_parquet(data_dir / \"valid.parquet\") if (data_dir / \"valid.parquet\").exists() else None\n", + "\n", + "print(f\"Train: {len(train_df):,} rows\")\n", + "print(f\"Test: {len(test_df):,} rows\")\n", + "if valid_df is not None:\n", + " print(f\"Valid: {len(valid_df):,} rows\")\n", + "else:\n", + " print(\"Valid: None (will create below)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 2. Create a Train/Validation Split\n", + "\n", + "AG News doesn't come with a built-in validation set, so we need to create one by setting aside some of the training data.\n", + "\n", + "### Why do we need a validation set?\n", + "\n", + "The **test set** should only be used once to get an honest performance estimate. 
But during development, you often want to check how well your model is doing (e.g., to try different hyperparameters). That's what the **validation set** is for: a \"practice test\" you can use as often as you like without contaminating your final evaluation.\n", + "\n", + "### What is stratified sampling?\n", + "\n", + "When we split the data, we use `stratify=train_df[\"label\"]`. This ensures each split has the same proportion of each class as the original. Without stratification, random chance might give us a validation set with very few \"Sports\" articles, for example, which would make our validation metrics unreliable." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split training data into:\n", + " Train: 96,000 rows (80%)\n", + " Validation: 24,000 rows (20%)\n", + "\n", + "Final sizes:\n", + " Train: 96,000\n", + " Validation: 24,000\n", + " Test: 7,600\n" + ] + } + ], + "source": [ + "if valid_df is None:\n", + " train_df, valid_df = train_test_split(\n", + " train_df,\n", + " test_size=cfg[\"test_size\"], # 20% goes to validation, a good rule of thumb is 60% train - 20% validation 20% test\n", + " random_state=cfg[\"random_state\"], # reproducible split\n", + " stratify=train_df[\"label\"] # keep class proportions balanced\n", + " )\n", + " print(f\"Split training data into:\")\n", + " print(f\" Train: {len(train_df):,} rows (80%)\")\n", + " print(f\" Validation: {len(valid_df):,} rows (20%)\")\n", + "else:\n", + " print(f\"Using existing validation split ({len(valid_df):,} rows)\")\n", + "\n", + "# Separate features (X) from labels (y) — standard ML convention\n", + "X_train, y_train = train_df[\"text\"].astype(str), train_df[\"label\"]\n", + "X_valid, y_valid = valid_df[\"text\"].astype(str), valid_df[\"label\"]\n", + "X_test, y_test = test_df[\"text\"].astype(str), test_df[\"label\"]\n", + "\n", + "print(f\"\\nFinal sizes:\")\n", + 
"print(f\" Train: {len(X_train):,}\")\n", + "print(f\" Validation: {len(X_valid):,}\")\n", + "print(f\" Test: {len(X_test):,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 3. Understand the Model: TF-IDF + Logistic Regression\n", + "\n", + "Before we build the model, let's understand what it actually does. Our pipeline has two stages:\n", + "\n", + "### Stage 1: TF-IDF (turning text into numbers)\n", + "\n", + "Machine learning models work with numbers, not raw text. **TF-IDF** (Term Frequency - Inverse Document Frequency) converts each article into a vector of numbers by:\n", + "\n", + "1. **Term Frequency (TF):** How often does each word appear in *this* article?\n", + "2. **Inverse Document Frequency (IDF):** How rare is this word across *all* articles?\n", + "\n", + "Words that appear frequently in one article but rarely overall get high TF-IDF scores. Common words like \"the\" and \"is\" get low scores because they appear everywhere and don't help distinguish categories.\n", + "\n", + "**Key settings from our config:**\n", + "- `max_features=30000` — Only keep the 30,000 most frequent terms (ranked by term frequency across the whole corpus). This reduces noise and speeds up training.\n", + "- `ngram_range=[1, 2]` — Consider both single words (\"market\") and two-word phrases (\"stock market\"). Bigrams capture useful context that single words miss.\n", + "\n", + "### Stage 2: Logistic Regression (making the prediction)\n", + "\n", + "Despite the name, **Logistic Regression** is a *classification* algorithm (not regression). It learns a set of weights — one per feature — and uses them to compute the probability that an article belongs to each class. 
The class with the highest probability wins.\n", + "\n", + "**Why start with Logistic Regression?**\n", + "- It's fast to train (seconds, not hours)\n", + "- It's interpretable (you can look at which words matter most)\n", + "- It's a strong baseline — often surprisingly competitive\n", + "- It establishes a performance floor before trying more complex models\n", + "\n", + "**Key settings:**\n", + "- `C=2.0` — Controls regularization. Higher C = less regularization = the model fits the training data more closely. Too high and it overfits; too low and it underfits.\n", + "- `max_iter=200` — Maximum number of optimization steps. If the model hasn't converged by 200 iterations, something might be wrong." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build the sklearn Pipeline\n", + "\n", + "Scikit-learn's `Pipeline` chains the two steps together so they act as a single unit. This is convenient because:\n", + "- Calling `pipe.fit(X, y)` runs TF-IDF fitting *and* model training in one step\n", + "- Calling `pipe.predict(X)` runs TF-IDF transformation *and* prediction\n", + "- You can't accidentally forget to transform your data before predicting" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pipeline structure:\n", + "Pipeline(steps=[('tfidf',\n", + " TfidfVectorizer(max_features=30000, ngram_range=(1, 2))),\n", + " ('clf', LogisticRegression(C=2.0, max_iter=200))])\n" + ] + } + ], + "source": [ + "pipe = Pipeline([\n", + " # Step 1: Convert text -> TF-IDF feature vectors\n", + " (\"tfidf\", TfidfVectorizer(\n", + " max_features=cfg[\"tfidf\"][\"max_features\"],\n", + " ngram_range=tuple(cfg[\"tfidf\"][\"ngram_range\"])\n", + " )),\n", + " # Step 2: Classify the feature vectors\n", + " (\"clf\", LogisticRegression(\n", + " C=cfg[\"model\"][\"C\"],\n", + " max_iter=cfg[\"model\"][\"max_iter\"]\n", + " ))\n", + "])\n", + "\n", 
+ "print(\"Pipeline structure:\")\n", + "print(pipe)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 4. Train the Model with MLflow Tracking\n", + "\n", + "### What is MLflow?\n", + "\n", + "[MLflow](https://mlflow.org/) is an experiment tracking tool. Every time you train a model, it records:\n", + "- **Parameters** — the settings you used (C, max_features, etc.)\n", + "- **Metrics** — how well the model performed (accuracy, F1 score)\n", + "- **Artifacts** — files like the saved model, plots, and reports\n", + "\n", + "This is invaluable when you're experimenting. Instead of trying to remember \"which run had C=2.0 with bigrams?\", MLflow keeps a log of everything.\n", + "\n", + "### Understanding the metrics\n", + "\n", + "- **Accuracy** — What percentage of predictions were correct? Simple and intuitive, but can be misleading if classes are imbalanced.\n", + "- **F1 Score (macro)** — The harmonic mean of precision and recall, averaged equally across all classes. This gives a balanced view of performance even if some classes are harder than others.\n", + " - **Precision** = Of all articles I predicted as \"Sports\", how many actually were?\n", + " - **Recall** = Of all actual \"Sports\" articles, how many did I find?" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026/02/23 18:25:31 INFO mlflow.tracking.fluent: Experiment with name 't0_setup_template' does not exist. 
Creating a new experiment.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MLflow tracking URI: /Users/Bartley/Documents/personal_dev/h4la/repos/data-science/tutorials/setup_template/mlruns\n", + "Experiment name: t0_setup_template\n" + ] + } + ], + "source": [ + "# Point MLflow at our local tracking directory\n", + "tracking_uri = get_env(\"MLFLOW_TRACKING_URI\", \"./mlruns\")\n", + "\n", + "tracking_dir = PROJECT_ROOT / \"mlruns\"\n", + "tracking_dir.mkdir(exist_ok=True)\n", + "\n", + "trash_dir = tracking_dir / \".trash\"\n", + "trash_dir.mkdir(exist_ok=True)\n", + "\n", + "mlflow.set_tracking_uri(str(PROJECT_ROOT / tracking_uri))\n", + "mlflow.set_experiment(cfg[\"experiment_name\"])\n", + "\n", + "print(f\"MLflow tracking URI: {PROJECT_ROOT / tracking_uri}\")\n", + "print(f\"Experiment name: {cfg['experiment_name']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with mlflow.start_run() as run:\n", + " # --- Log parameters so we can look them up later ---\n", + " mlflow.log_params({\n", + " \"dataset\": cfg[\"dataset\"],\n", + " \"tfidf_max_features\": cfg[\"tfidf\"][\"max_features\"],\n", + " \"tfidf_ngram_range\": str(cfg[\"tfidf\"][\"ngram_range\"]),\n", + " \"model\": cfg[\"model\"][\"type\"],\n", + " \"C\": cfg[\"model\"][\"C\"],\n", + " \"max_iter\": cfg[\"model\"][\"max_iter\"],\n", + " \"random_state\": cfg[\"random_state\"]\n", + " })\n", + "\n", + " # --- Train the model ---\n", + " print(\"Training... 
(this may take a minute on the full dataset)\")\n", + " pipe.fit(X_train, y_train)\n", + " print(\"Training complete!\\n\")\n", + "\n", + " # --- Generate predictions on validation and test sets ---\n", + " y_pred_valid = pipe.predict(X_valid)\n", + " y_pred_test = pipe.predict(X_test)\n", + "\n", + " # --- Compute metrics ---\n", + " avg = cfg[\"metrics\"][\"average\"]\n", + " metrics = {\n", + " \"valid_accuracy\": accuracy_score(y_valid, y_pred_valid),\n", + " \"valid_f1_macro\": f1_score(y_valid, y_pred_valid, average=avg),\n", + " \"test_accuracy\": accuracy_score(y_test, y_pred_test),\n", + " \"test_f1_macro\": f1_score(y_test, y_pred_test, average=avg),\n", + " }\n", + " mlflow.log_metrics(metrics)\n", + "\n", + " print(\"Results:\")\n", + " print(f\" Validation — Accuracy: {metrics['valid_accuracy']:.4f}, F1 (macro): {metrics['valid_f1_macro']:.4f}\")\n", + " print(f\" Test — Accuracy: {metrics['test_accuracy']:.4f}, F1 (macro): {metrics['test_f1_macro']:.4f}\")\n", + "\n", + " # --- Save the model as an MLflow artifact ---\n", + " mlflow.sklearn.log_model(pipe, artifact_path=\"model\")\n", + "\n", + " # Keep the run ID — we'll need it to load the model in notebook 3\n", + " RUN_ID = run.info.run_id\n", + " print(f\"\\nMLflow run ID: {RUN_ID}\")\n", + " print(\"Model saved as MLflow artifact.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 5. Evaluate the Model\n", + "\n", + "Overall accuracy tells you *how often* the model is right, but not *where* it goes wrong. That's what the confusion matrix and classification report are for.\n", + "\n", + "### 5a. Confusion Matrix\n", + "\n", + "A confusion matrix is a grid that shows what the model predicted vs. what the actual label was. 
The diagonal shows correct predictions; off-diagonal cells show mistakes.\n", + "\n", + "**How to read it:**\n", + "- Each **row** is an actual class (what the article really is)\n", + "- Each **column** is a predicted class (what the model guessed)\n", + "- A perfect model would have numbers only on the diagonal\n", + "- Off-diagonal numbers tell you which classes get confused with each other (e.g., \"World\" articles misclassified as \"Business\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reports_dir = PROJECT_ROOT / \"reports\"\n", + "reports_dir.mkdir(exist_ok=True)\n", + "\n", + "fig, ax = plt.subplots(figsize=(7, 6))\n", + "ConfusionMatrixDisplay.from_predictions(\n", + " y_test, y_pred_test,\n", + " display_labels=[\"World\", \"Sports\", \"Business\", \"Sci/Tech\"],\n", + " ax=ax,\n", + " cmap=\"Blues\"\n", + ")\n", + "ax.set_title(\"Confusion Matrix (Test Set)\")\n", + "\n", + "fig.savefig(reports_dir / \"confusion_matrix.png\", dpi=180, bbox_inches=\"tight\")\n", + "print(f\"Saved: {reports_dir / 'confusion_matrix.png'}\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5b. Classification Report\n", + "\n", + "The classification report gives you per-class precision, recall, and F1 score — a more detailed view than the single F1 number above.\n", + "\n", + "**Reminder of what these mean:**\n", + "\n", + "| Metric | Question it answers |\n", + "|---|---|\n", + "| **Precision** | When the model says \"Sports\", how often is it right? |\n", + "| **Recall** | Of all actual Sports articles, what fraction did the model find? |\n", + "| **F1** | The balance between precision and recall (harmonic mean) |\n", + "| **Support** | How many test examples belong to this class |\n", + "\n", + "If a class has high precision but low recall, the model is cautious — it only predicts that class when it's very confident, but misses many actual examples. 
The reverse (low precision, high recall) means the model over-predicts that class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label_names = {0: \"World\", 1: \"Sports\", 2: \"Business\", 3: \"Sci/Tech\"}\n", + "target_names = [label_names[i] for i in sorted(label_names.keys())]\n", + "\n", + "report = classification_report(y_test, y_pred_test, target_names=target_names)\n", + "print(\"Classification Report (Test Set):\")\n", + "print(report)\n", + "\n", + "# Save the report as a text file\n", + "report_path = reports_dir / \"classification_report.txt\"\n", + "report_path.write_text(report)\n", + "print(f\"Saved: {report_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5c. Log Evaluation Artifacts to MLflow\n", + "\n", + "Let's add the confusion matrix and classification report to our MLflow run so they're stored alongside the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Re-open the same run to add artifacts\n", + "with mlflow.start_run(run_id=RUN_ID):\n", + " mlflow.log_artifact(str(reports_dir / \"confusion_matrix.png\"))\n", + " mlflow.log_artifact(str(reports_dir / \"classification_report.txt\"))\n", + " print(\"Logged evaluation artifacts to MLflow.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 6. Explore MLflow Artifacts\n", + "\n", + "Everything we logged is now stored under `mlruns/`. 
The model artifact includes:\n", + "\n", + "| File | Purpose |\n", + "|---|---|\n", + "| `model/model.pkl` | The serialized sklearn pipeline (TF-IDF + LogReg) |\n", + "| `model/MLmodel` | Metadata about the model (what flavor, how to load it) |\n", + "| `model/conda.yaml` | Conda environment for reproducibility |\n", + "| `model/requirements.txt` | Pip requirements for the model |\n", + "\n", + "You can explore these visually by launching the MLflow UI:\n", + "\n", + "```bash\n", + "mlflow ui --backend-store-uri ./mlruns\n", + "```\n", + "\n", + "Then open http://localhost:5000 in your browser." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 7. Save the Run ID for Notebook 3\n", + "\n", + "Notebook 3 needs to know which MLflow run contains our trained model. We'll save the run ID to a small file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_id_path = PROJECT_ROOT / \"data\" / \"interim\" / \"latest_run_id.txt\"\n", + "run_id_path.write_text(RUN_ID)\n", + "print(f\"Saved run ID to: {run_id_path}\")\n", + "print(f\"Run ID: {RUN_ID}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "Here's what we accomplished:\n", + "\n", + "- **Split** the training data into train (80%) and validation (20%) sets using stratified sampling\n", + "- **Built** a TF-IDF + Logistic Regression pipeline — a fast, interpretable text classifier\n", + "- **Trained** the model and evaluated it on both validation and test sets\n", + "- **Analyzed** errors using a confusion matrix and per-class classification report\n", + "- **Tracked** everything (parameters, metrics, model, plots) with MLflow\n", + "\n", + "**Things to try on your own:**\n", + "- Change `C` in `configs/baseline.yaml` (try 0.1, 1.0, 10.0) and see how it affects performance\n", + "- Set `ngram_range` to `[1, 1]` (unigrams only) — how much do bigrams 
help?\n", + "- Reduce `max_features` to 5000 — does the model still work well?\n", + "- Compare all your runs in the MLflow UI\n", + "\n", + "**Next up:** [03_serve_and_predict.ipynb](./03_serve_and_predict.ipynb) — we'll load the saved model and use it to classify new text, just like the FastAPI endpoint does." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/setup_template/notebooks/03_serve_and_predict.ipynb b/tutorials/setup_template/notebooks/03_serve_and_predict.ipynb new file mode 100644 index 0000000..d497e2c --- /dev/null +++ b/tutorials/setup_template/notebooks/03_serve_and_predict.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook 3: Serve & Predict\n", + "\n", + "You've explored the data (notebook 1) and trained a model (notebook 2). Now let's put it to work.\n", + "\n", + "**What you'll do in this notebook:**\n", + "\n", + "1. Load the trained model from MLflow\n", + "2. Run predictions on sample texts\n", + "3. 
Understand how the FastAPI serving endpoint works\n", + "\n", + "**What you'll learn:**\n", + "- How MLflow stores and loads models\n", + "- How to go from a trained model to a prediction in just a few lines\n", + "- How a model gets served as a web API that other applications can call\n", + "\n", + "---\n", + "\n", + "### Notebook series\n", + "\n", + "| Notebook | Focus |\n", + "|---|---|\n", + "| 01 — Setup & EDA | Environment, data loading, exploration |\n", + "| 02 — Train & Evaluate | Build, train, and evaluate a text classifier |\n", + "| **03 — Serve & Predict** (you are here) | Load the trained model and make predictions |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "import mlflow\n", + "import mlflow.pyfunc\n", + "\n", + "# Locate the project root\n", + "PROJECT_ROOT = (\n", + " Path(\"__vsc_ipynb_file__\").resolve().parent.parent\n", + " if \"__vsc_ipynb_file__\" in dir()\n", + " else Path.cwd().parent\n", + ")\n", + "if not (PROJECT_ROOT / \"requirements.txt\").exists():\n", + " PROJECT_ROOT = Path.cwd()\n", + "\n", + "if str(PROJECT_ROOT) not in sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from src.utils import get_env\n", + "\n", + "# Point MLflow at our local tracking directory\n", + "tracking_uri = get_env(\"MLFLOW_TRACKING_URI\", \"./mlruns\")\n", + "mlflow.set_tracking_uri(str(PROJECT_ROOT / tracking_uri))\n", + "\n", + "print(f\"Project root: {PROJECT_ROOT}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 2. Load the Trained Model\n", + "\n", + "In notebook 2, we saved our trained model as an MLflow artifact and wrote the run ID to a file. 
Now we'll load it back.\n", + "\n", + "### How MLflow model loading works\n", + "\n", + "MLflow uses a URI format to locate models: `runs:/<run_id>/model`. This tells MLflow:\n", + "- Look in the run with this ID\n", + "- Find the artifact named \"model\"\n", + "- Deserialize it back into a working Python object\n", + "\n", + "We use `mlflow.pyfunc.load_model()` which returns a generic \"PyFunc\" wrapper. This is the same interface the FastAPI server uses — so what you see here is exactly how the API serves predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the run ID saved by notebook 2\n", + "run_id_path = PROJECT_ROOT / \"data\" / \"interim\" / \"latest_run_id.txt\"\n", + "RUN_ID = run_id_path.read_text().strip()\n", + "print(f\"Loading model from MLflow run: {RUN_ID}\")\n", + "\n", + "# Load the model\n", + "model_uri = f\"runs:/{RUN_ID}/model\"\n", + "loaded_model = mlflow.pyfunc.load_model(model_uri)\n", + "\n", + "print(f\"Model loaded successfully from: {model_uri}\")\n", + "print(f\"Model type: {type(loaded_model)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 3. Make Predictions\n", + "\n", + "Let's test the model on some made-up headlines to see if it can correctly categorize them. 
These are texts the model has never seen before — a simple \"sanity check\" that the model works as expected.\n", + "\n", + "The AG News categories are:\n", + "\n", + "| Label | Category | Example topics |\n", + "|---|---|---|\n", + "| 0 | World | International politics, diplomacy, conflicts |\n", + "| 1 | Sports | Games, scores, athletes, tournaments |\n", + "| 2 | Business | Markets, earnings, mergers, economy |\n", + "| 3 | Sci/Tech | Research, gadgets, software, space |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Some sample headlines spanning all four categories\n", + "sample_texts = [\n", + " \"NASA launches new spacecraft to explore Mars\",\n", + " \"Stock market hits record high amid economic optimism\",\n", + " \"Champions League final draws record viewership\",\n", + " \"Google unveils latest advances in artificial intelligence\",\n", + " \"UN Security Council holds emergency meeting on conflict\",\n", + " \"Lakers defeat Celtics in overtime thriller\",\n", + "]\n", + "\n", + "label_names = {0: \"World\", 1: \"Sports\", 2: \"Business\", 3: \"Sci/Tech\"}\n", + "\n", + "# Run predictions\n", + "predictions = loaded_model.predict(sample_texts)\n", + "\n", + "print(\"Predictions:\\n\")\n", + "for text, pred in zip(sample_texts, predictions):\n", + " label_int = int(pred)\n", + " category = label_names.get(label_int, str(label_int))\n", + " print(f\" [{category:10s}] {text}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try your own texts\n", + "\n", + "Edit the cell below to test the model on any text you want. Does it get them right? Can you find examples that trick it?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this with any text you want to classify\n", + "your_text = \"Apple announces record quarterly revenue driven by iPhone sales\"\n", + "\n", + "pred = loaded_model.predict([your_text])\n", + "label_int = int(pred[0])\n", + "print(f\"Text: {your_text}\")\n", + "print(f\"Category: {label_names[label_int]} (label {label_int})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 4. How the FastAPI Endpoint Works\n", + "\n", + "In a real application, you wouldn't open a notebook every time you want a prediction. Instead, you'd serve the model as a **web API** — a URL that other programs can send text to and get predictions back.\n", + "\n", + "This project includes a FastAPI server in `src/serve.py` that does exactly what we just did above, but over HTTP.\n", + "\n", + "### The key idea\n", + "\n", + "The serving code is surprisingly short:\n", + "\n", + "```python\n", + "# 1. Load the model once at startup\n", + "model = mlflow.pyfunc.load_model(model_path)\n", + "\n", + "# 2. 
For each incoming request, run a prediction\n", + "@app.post(\"/infer\")\n", + "def infer(payload: InferRequest):\n", + " pred = model.predict([payload.text])\n", + " return {\"label\": int(pred[0])}\n", + "```\n", + "\n", + "That's it — the same `model.predict()` call we used above, wrapped in a web endpoint.\n", + "\n", + "### How to run it\n", + "\n", + "From the project root (not from this notebook), run:\n", + "\n", + "```bash\n", + "uvicorn src.serve:app --reload\n", + "```\n", + "\n", + "Then send it a request:\n", + "\n", + "```bash\n", + "curl -X POST http://localhost:8000/infer \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{\"text\": \"NASA launches new spacecraft to explore Mars\"}'\n", + "```\n", + "\n", + "You should get back something like:\n", + "```json\n", + "{\"label\": 3}\n", + "```\n", + "\n", + "### What `--reload` does\n", + "\n", + "The `--reload` flag tells uvicorn to watch for file changes and automatically restart the server. This is handy during development but should be turned off in production.\n", + "\n", + "### Why FastAPI?\n", + "\n", + "FastAPI is a popular Python web framework that's:\n", + "- **Fast** — built on async Python for high throughput\n", + "- **Self-documenting** — automatically generates API docs at `/docs`\n", + "- **Type-safe** — uses Pydantic for request/response validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 5. Look at the Source Code\n", + "\n", + "Throughout these notebooks, we inlined all the logic so you could see every step. 
But the project also has equivalent module files in `src/` that are ready for production use:\n", + "\n", + "| Module | What it does | Notebook equivalent |\n", + "|---|---|---|\n", + "| `src/data.py` | Loads HF datasets into DataFrames | Notebook 1, section 3 |\n", + "| `src/eda.py` | Prints stats and saves plots | Notebook 1, section 4 |\n", + "| `src/baseline.py` | Trains TF-IDF + LogReg with MLflow | Notebook 2, sections 3–5 |\n", + "| `src/serve.py` | FastAPI inference endpoint | This notebook, section 4 |\n", + "| `src/utils.py` | `set_all_seeds()` and `get_env()` helpers | Used everywhere |\n", + "\n", + "You can run these from the terminal as modules:\n", + "\n", + "```bash\n", + "python -m src.eda # Run EDA\n", + "python -m src.baseline # Train the model\n", + "uvicorn src.serve:app # Serve predictions\n", + "```\n", + "\n", + "The notebooks are for learning and exploration; the `src/` modules are for repeatable, scriptable workflows." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "Here's what we accomplished across all three notebooks:\n", + "\n", + "1. **Setup & EDA** — Installed dependencies, loaded AG News, explored the data\n", + "2. **Train & Evaluate** — Built a TF-IDF + Logistic Regression pipeline, trained it, and evaluated with a confusion matrix and classification report\n", + "3. 
**Serve & Predict** — Loaded the saved model, made predictions, and learned how the FastAPI endpoint works\n", + "\n", + "### What to explore next\n", + "\n", + "- **Try a different dataset** — Change `dataset` in `configs/baseline.yaml` to `imdb`, `rotten_tomatoes`, or another HF text classification dataset\n", + "- **Tune hyperparameters** — Adjust `C`, `max_iter`, `max_features`, or `ngram_range` and compare runs in MLflow\n", + "- **Swap the model** — Replace `LogisticRegression` with `SGDClassifier` or `RandomForestClassifier`\n", + "- **Launch the API** — Run `uvicorn src.serve:app --reload` and try it with real queries\n", + "- **Browse MLflow** — Run `mlflow ui --backend-store-uri ./mlruns` and explore your experiment history\n", + "\n", + "Happy hacking!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/setup_template/notebooks/shell1.png b/tutorials/setup_template/notebooks/shell1.png new file mode 100644 index 0000000..313e8de Binary files /dev/null and b/tutorials/setup_template/notebooks/shell1.png differ diff --git a/tutorials/setup_template/requirements.txt b/tutorials/setup_template/requirements.txt new file mode 100644 index 0000000..ce7c82d --- /dev/null +++ b/tutorials/setup_template/requirements.txt @@ -0,0 +1,13 @@ +pandas==2.2.2 +scikit-learn==1.5.2 +spacy==3.7.6 +matplotlib==3.9.2 +datasets==3.0.1 +transformers==4.44.2 +mlflow==2.16.2 +numpy==1.26.4 +pyyaml==6.0.2 +python-dotenv==1.0.1 +pydantic==2.9.2 +fastapi==0.115.0 +uvicorn==0.30.6 diff --git a/tutorials/setup_template/src/__init__.py b/tutorials/setup_template/src/__init__.py new file mode 
def run_baseline(cfg_path="configs/baseline.yaml"):
    """Train the TF-IDF + logistic-regression baseline and log it to MLflow.

    Reads the YAML config at ``cfg_path``, loads the configured dataset,
    fits the pipeline on the training split, scores it on the validation
    and test splits, and logs parameters, metrics, a confusion-matrix
    image, a classification report and the fitted model to a new MLflow
    run. Report files are written under ``reports/`` relative to the
    current working directory.

    Args:
        cfg_path: Path to the YAML configuration file.
    """
    set_all_seeds(42)
    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(cfg_path, encoding="utf-8") as cfg_file:
        cfg = yaml.safe_load(cfg_file)

    mlflow.set_tracking_uri(get_env("MLFLOW_TRACKING_URI", "./mlruns"))
    mlflow.set_experiment(cfg["experiment_name"])

    train_df, valid_df, test_df = load_text_classification(
        cfg["dataset"], cache_dir=get_env("DATA_CACHE_DIR", "./.hf_cache")
    )

    # No validation split was returned: carve one out of the training data,
    # stratified on the label column.
    if valid_df is None:
        train_df, valid_df = train_test_split(
            train_df,
            test_size=cfg["test_size"],
            random_state=cfg["random_state"],
            stratify=train_df["label"],
        )

    X_train, y_train = train_df["text"].astype(str), train_df["label"]
    X_valid, y_valid = valid_df["text"].astype(str), valid_df["label"]
    X_test, y_test = test_df["text"].astype(str), test_df["label"]

    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=cfg["tfidf"]["max_features"],
            # YAML yields a list; the vectorizer is given a tuple.
            ngram_range=tuple(cfg["tfidf"]["ngram_range"]),
        )),
        ("clf", LogisticRegression(
            C=cfg["model"]["C"],
            max_iter=cfg["model"]["max_iter"],
        )),
    ])

    with mlflow.start_run():
        # log params
        mlflow.log_params({
            "dataset": cfg["dataset"],
            "tfidf_max_features": cfg["tfidf"]["max_features"],
            "tfidf_ngram_range": str(cfg["tfidf"]["ngram_range"]),
            "model": cfg["model"]["type"],
            "C": cfg["model"]["C"],
            "max_iter": cfg["model"]["max_iter"],
            "random_state": cfg["random_state"],
        })

        pipe.fit(X_train, y_train)
        y_pred_valid = pipe.predict(X_valid)
        y_pred_test = pipe.predict(X_test)

        # metrics
        average = cfg["metrics"]["average"]
        acc_valid = accuracy_score(y_valid, y_pred_valid)
        f1_valid = f1_score(y_valid, y_pred_valid, average=average)
        acc_test = accuracy_score(y_test, y_pred_test)
        f1_test = f1_score(y_test, y_pred_test, average=average)

        mlflow.log_metrics({
            "valid_accuracy": acc_valid,
            "valid_f1_macro": f1_valid,
            "test_accuracy": acc_test,
            "test_f1_macro": f1_test,
        })

        # save confusion matrix
        os.makedirs("reports", exist_ok=True)
        fig = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test).figure_
        fig.savefig("reports/confusion_matrix.png", dpi=180, bbox_inches="tight")
        # Release the figure once it is on disk so repeated calls in one
        # process do not accumulate open matplotlib figures.
        plt.close(fig)
        mlflow.log_artifact("reports/confusion_matrix.png")

        # save text report
        report = classification_report(y_test, y_pred_test)
        with open("reports/classification_report.txt", "w") as f:
            f.write(report)
        mlflow.log_artifact("reports/classification_report.txt")

        # save model
        mlflow.sklearn.log_model(pipe, artifact_path="model")

    print("Validation -> acc:", acc_valid, "f1_macro:", f1_valid)
    print("Test -> acc:", acc_test, "f1_macro:", f1_test)
    print("\nClassification report saved at reports/classification_report.txt")
def main(cfg_path="configs/baseline.yaml"):
    """Print basic dataset statistics and save a token-length histogram.

    Reads the dataset name from the YAML config at ``cfg_path``, loads the
    dataset, prints the ``describe_dataset`` summary for each available
    split, and writes a histogram of whitespace-token counts for the
    training texts to ``reports/eda_token_lengths.png`` (relative to the
    current working directory).

    Args:
        cfg_path: Path to the YAML configuration file.
    """
    set_all_seeds(42)
    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(cfg_path, encoding="utf-8") as cfg_file:
        cfg = yaml.safe_load(cfg_file)
    cache = get_env("DATA_CACHE_DIR", "./.hf_cache")

    train_df, valid_df, test_df = load_text_classification(cfg["dataset"], cache_dir=cache)

    # 1) Print simple stats
    print("TRAIN:", describe_dataset(train_df))
    if valid_df is not None:
        print("VALID:", describe_dataset(valid_df))
    print("TEST :", describe_dataset(test_df))

    # 2) Plot token length histogram (train)
    lengths = train_df["text"].astype(str).str.split().map(len)
    fig = plt.figure()
    lengths.hist(bins=50)
    plt.xlabel("Tokens per example")
    plt.ylabel("Count")
    plt.title("Token Lengths (train)")
    os.makedirs("reports", exist_ok=True)
    plt.savefig("reports/eda_token_lengths.png", dpi=160, bbox_inches="tight")
    # Release the figure once it is on disk so repeated calls in one
    # process do not accumulate open matplotlib figures.
    plt.close(fig)
    print("Saved: reports/eda_token_lengths.png")
def _latest_model_path():
    """Return the most recently modified local MLflow model directory.

    Searches ``mlruns/*/*/artifacts/model`` relative to the current working
    directory and picks the candidate with the latest modification time.
    Sorting the paths alphabetically would not do this: the run directory
    names are compared as plain strings, which says nothing about which
    run was written last.

    Returns:
        The path (as returned by ``glob``) of the newest model directory.

    Raises:
        RuntimeError: If no model artifact directory is found.
    """
    import os  # local import: this module's top-level imports do not include os

    candidates = glob.glob("mlruns/*/*/artifacts/model")
    if not candidates:
        raise RuntimeError("No model artifacts found. Run baseline first.")
    return max(candidates, key=os.path.getmtime)