From 6ada37b69cd1e85dc8c4aea5c1badb60484cf021 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh Date: Fri, 24 Apr 2026 22:43:13 +0100 Subject: [PATCH 1/4] Update iteration.ipynb to replace references from pandas to polars, enhance output displays for various code snippets, and adjust execution counts for improved clarity and consistency. --- iteration.ipynb | 558 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 470 insertions(+), 88 deletions(-) diff --git a/iteration.ipynb b/iteration.ipynb index fc1452a..5975665 100644 --- a/iteration.ipynb +++ b/iteration.ipynb @@ -21,12 +21,12 @@ "\n", "One tool for reducing duplication is functions, which reduce duplication by identifying repeated patterns of code and extract them out into independent pieces that can be easily reused and updated. Another tool for reducing duplication is *iteration*, which helps you when you need to do the same thing to multiple inputs: repeating the same operation on different columns, or on different datasets.\n", "\n", - "In this chapter you'll learn about iteration in three ways: explicit iteration, using for loops and while loops; iteration via comprehensions (eg list comprehensions); and iteration for **pandas** data frames." + "In this chapter you'll learn about iteration in three ways: explicit iteration, using for loops and while loops; iteration via comprehensions (eg list comprehensions); and iteration for **polars** data frames." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "51a55374", "metadata": { "tags": [ @@ -51,7 +51,7 @@ "source": [ "### Prerequisites\n", "\n", - "This chapter will use the **pandas** data analysis package." + "This chapter will use the **polars** data analysis package." 
] }, { @@ -68,10 +68,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a2bbd41c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lovelace\n", + "Smith\n", + "Pigou\n", + "Babbage\n" + ] + } + ], "source": [ "name_list = [\"Lovelace\", \"Smith\", \"Pigou\", \"Babbage\"]\n", "\n", @@ -106,10 +117,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "835ebda7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hello\n", + "---end entry---\n" + ] + } + ], "source": [ "for entry in [\"hello\"]:\n", " print(entry)\n", @@ -126,10 +146,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "2a19ac2e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "h\n", + "---end entry---\n", + "e\n", + "---end entry---\n", + "l\n", + "---end entry---\n", + "l\n", + "---end entry---\n", + "o\n", + "---end entry---\n" + ] + } + ], "source": [ "for entry in \"hello\":\n", " print(entry)\n", @@ -150,10 +187,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "239e133f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The name in position 0 is Lovelace\n", + "The name in position 1 is Smith\n", + "The name in position 2 is Hopper\n", + "The name in position 3 is Babbage\n" + ] + } + ], "source": [ "name_list = [\"Lovelace\", \"Smith\", \"Hopper\", \"Babbage\"]\n", "\n", @@ -171,10 +219,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b66c5c53", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The name in position 1 is Lovelace\n", + "The name in position 2 is Smith\n", + "The name in 
position 3 is Hopper\n", + "The name in position 4 is Babbage\n" + ] + } + ], "source": [ "for i, name in enumerate(name_list, start=1):\n", " print(f\"The name in position {i} is {name}\")" @@ -190,10 +249,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "010239fe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In Paris, the temperature is 28 degrees C today.\n", + "In London, the temperature is 22 degrees C today.\n", + "In Seville, the temperature is 36 degrees C today.\n", + "In Wellesley, the temperature is 29 degrees C today.\n" + ] + } + ], "source": [ "cities_to_temps = {\"Paris\": 28, \"London\": 22, \"Seville\": 36, \"Wellesley\": 29}\n", "\n", @@ -217,10 +287,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "8ea3efc5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ada Lovelace\n", + "Adam Smith\n", + "Grace Hopper\n", + "Charles Babbage\n" + ] + } + ], "source": [ "first_names = [\"Ada\", \"Adam\", \"Grace\", \"Charles\"]\n", "last_names = [\"Lovelace\", \"Smith\", \"Hopper\", \"Babbage\"]\n", @@ -257,10 +338,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "7efed381", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[51, 52, 53, 54, 55, 56, 57, 58, 59, 60]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "num_list = range(50, 60)\n", "[1 + num for num in num_list]" @@ -286,10 +378,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "722fda21", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39]\n" + ] + } + ], "source": [ "number_list = range(1, 40)\n", 
"divide_list = [x for x in number_list if x % 3 == 0]\n", @@ -308,10 +408,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "b6e80d6b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Adam Smith', 'leonara smith']\n" + ] + } + ], "source": [ "names_list = [\"Joe Bloggs\", \"Adam Smith\", \"Sandra Noone\", \"leonara smith\"]\n", "smith_list = [x for x in names_list if \"smith\" in x.lower()]\n", @@ -330,10 +438,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "f348bfb6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Not Smith!', 'Adam Smith', 'Not Smith!', 'leonara smith']\n" + ] + } + ], "source": [ "names_list = [\"Joe Bloggs\", \"Adam Smith\", \"Sandra Noone\", \"leonara smith\"]\n", "smith_list = [x if \"smith\" in x.lower() else \"Not Smith!\" for x in names_list]\n", @@ -350,10 +466,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "74e4fcc7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Ada Lovelace', 'Adam Smith', 'Grace Hopper', 'Charles Babbage']\n" + ] + } + ], "source": [ "first_names = [\"Ada\", \"Adam\", \"Grace\", \"Charles\"]\n", "last_names = [\"Lovelace\", \"Smith\", \"Hopper\", \"Babbage\"]\n", @@ -371,10 +495,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "2c82cf1f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['Ada Lovelace', 'Adam Lovelace'], ['Ada Smith', 'Adam Smith']]\n" + ] + } + ], "source": [ "first_names = [\"Ada\", \"Adam\"]\n", "last_names = [\"Lovelace\", \"Smith\"]\n", @@ -394,10 +526,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "acef16ae", "metadata": 
{}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'Ada': 'Lovelace', 'Adam': 'Smith'}" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "{key: value for key, value in zip(first_names, last_names)}" ] @@ -420,10 +563,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "3e47ba02", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n", + "9\n", + "8\n", + "7\n", + "6\n", + "5\n", + "4\n", + "3\n", + "2\n", + "1\n", + "execution complete\n" + ] + } + ], "source": [ "n = 10\n", "while n > 0:\n", @@ -452,11 +613,13 @@ "id": "5ec0643e", "metadata": {}, "source": [ - "## Iteration with **pandas** Data Frames\n", + "## Iteration with **polars** Data Frames\n", "\n", - "For loops, while loops, and comprehensions all work on **pandas** data frames, but they are generally a bad way to get things done because they are slow and not memory efficient. To aid cases where iteration is needed, **pandas** has built-in methods for iteration depending on what you need to do.\n", + "For loops, while loops, and comprehensions can be used with data frames, but in **Polars**, they are even more strongly discouraged than in pandas. 
**Polars** is built on a columnar, vectorized, and expression-based engine, so row-by-row iteration breaks performance and prevents optimizations.\n", "\n", "These built-in methods for iteration have an overlap with what we've seen in @sec-data-transform but we'll dig a little deeper into `assign()`/assignment operations, `apply()`, and `eval()` here.\n", + "\n", + "Instead of iterating, **Polars** encourages you to use expressions and lazy evaluation, which are much faster and more memory efficient.\n", "\n" ] }, @@ -474,15 +637,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "b3116809", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 4)
abcd
f64f64f64f64
-0.274564-0.7875621.4215290.502933
0.3290070.470936-1.4572540.682447
-0.5765030.492678-0.6358111.311106
0.5108971.906728-1.4333840.310135
0.486463-1.1124421.686876-1.668481
0.6703620.81884-0.7804550.777569
" + ], + "text/plain": [ + "shape: (6, 4)\n", + "┌───────────┬───────────┬───────────┬───────────┐\n", + "│ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ -0.274564 ┆ -0.787562 ┆ 1.421529 ┆ 0.502933 │\n", + "│ 0.329007 ┆ 0.470936 ┆ -1.457254 ┆ 0.682447 │\n", + "│ -0.576503 ┆ 0.492678 ┆ -0.635811 ┆ 1.311106 │\n", + "│ 0.510897 ┆ 1.906728 ┆ -1.433384 ┆ 0.310135 │\n", + "│ 0.486463 ┆ -1.112442 ┆ 1.686876 ┆ -1.668481 │\n", + "│ 0.670362 ┆ 0.81884 ┆ -0.780455 ┆ 0.777569 │\n", + "└───────────┴───────────┴───────────┴───────────┘" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import polars as pl\n", "import numpy as np\n", - "import pandas as pd\n", "\n", - "df = pd.DataFrame(np.random.normal(size=(6, 4)), columns=[\"a\", \"b\", \"c\", \"d\"])\n", + "df = pl.DataFrame(np.random.normal(size=(6, 4)), schema=[\"a\", \"b\", \"c\", \"d\"])\n", "df" ] }, @@ -491,27 +687,90 @@ "id": "0e826ad5", "metadata": {}, "source": [ - "**pandas** has many built-in functions that are already built to iterate over rows and columns; for example, to compute the median of rows or columns respectively:" + "**polars** has built-in expressions designed to operate over columns and rows. For example, to compute the median:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "ac909c2f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 4)
abcd
f64f64f64f64
0.4077350.481807-0.7081330.59269
" + ], + "text/plain": [ + "shape: (1, 4)\n", + "┌──────────┬──────────┬───────────┬─────────┐\n", + "│ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════╪══════════╪═══════════╪═════════╡\n", + "│ 0.407735 ┆ 0.481807 ┆ -0.708133 ┆ 0.59269 │\n", + "└──────────┴──────────┴───────────┴─────────┘" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.median(axis=\"rows\") # can also use axis=1" + "df.select(pl.all().median())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "96426002", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 1)
row_median
f64
0.114185
0.399972
-0.041912
0.410516
-0.31299
0.723965
" + ], + "text/plain": [ + "shape: (6, 1)\n", + "┌────────────┐\n", + "│ row_median │\n", + "│ --- │\n", + "│ f64 │\n", + "╞════════════╡\n", + "│ 0.114185 │\n", + "│ 0.399972 │\n", + "│ -0.041912 │\n", + "│ 0.410516 │\n", + "│ -0.31299 │\n", + "│ 0.723965 │\n", + "└────────────┘" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.median(axis=\"columns\") # can also use axis=0" + "df.select(\n", + " pl.concat_list(pl.all()).list.median().alias(\"row_median\")\n", + ")" ] }, { @@ -524,19 +783,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "060b6815", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "247 μs ± 1.65 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], "source": [ "# Do not do this!\n", "\n", - "\n", "def add_five_slow(df):\n", " for i in range(len(df)):\n", " for j in range(len(df.columns)):\n", - " df.iloc[i, j] = df.iloc[i, j] + 5\n", - "\n", + " df[i, j] = df[i, j] + 5\n", "\n", "%timeit add_five_slow(df)" ] @@ -546,15 +811,23 @@ "id": "8246132e", "metadata": {}, "source": [ - "But to do this, every individual cell must be accessed and operated on—so it is very slow, taking milliseconds. **pandas** has far faster ways of performing the same operation. For simple operations on data frames with consistent type, you can simply add five to the whole data frame:" + "But to do this, every individual cell must be accessed and operated on—so it is very slow, taking milliseconds. **polars** has far faster ways of performing the same operation. 
For simple operations on data frames with consistent type, you can simply add five to the whole data frame:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "1a48ae52", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "52.8 μs ± 1.79 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], "source": [ "%timeit df + 5" ] @@ -572,19 +845,19 @@ "id": "7313616e", "metadata": {}, "source": [ - "This also works on a per column basis, so you can do `df[\"a\"] = df[\"a\"] + 5` and so on.\n", + "This also works on a per column basis, so you can do `df.with_columns(pl.col(\"a\") + 5)` and so on.\n", "\n", - "These operations have equivalents using the `assign()` operator, which allows for *method chaining*; stringing multiple operations together. The `assign()` operator version of `df[\"new_a\"] = df[\"a\"] + 5` would be" + "These operations have equivalents using method chaining; stringing multiple operations together. The version of `df.with_columns(new_a = pl.col(\"a\") + 5)` would be:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "f7391dc5", "metadata": {}, "outputs": [], "source": [ - "df = df.assign(new_a=lambda x: x[\"a\"] + 5)" + "df = df.with_columns(new_a = pl.col(\"a\") + 5)" ] }, { @@ -592,19 +865,66 @@ "id": "76aec162", "metadata": {}, "source": [ - "### Apply\n", + "### Expressions (Polars' Alternative to apply)\n", "\n", - "What happens if you have a more complicated function you want to iterate over? This is where **pandas**' `apply()` comes in, and can be used with assignment. `apply()` can also be used across rows or columns. Like `assign()`, it can be combined with a lambda function and used with either the whole data frame or just a column (in which case no need to specify `axis=`)." + "What happens if you have a more complicated operation you want to perform? 
In pandas, you might reach for `apply()`. In **polars**, you almost never need an equivalent because its expression API is incredibly expressive.\n", + "\n", + "Most \"complicated\" operations can be expressed directly using **polars'** built-in expressions:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "31adcb3f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 6)
abcdnew_aresult
f64f64f64f64f64f64
40554.72543640554.21243840556.42152940555.50293340559.725436-7.674924
40555.32900740555.47093640553.54274640555.68244740560.329007-2.933522
40554.42349740555.49267840554.36418940556.31110640559.423497-4.638827
40555.51089740556.90672840553.56661640555.31013540560.510897-1.339664
40555.48646340553.88755840556.68687640553.33151940560.486463-7.504234
40555.67036240555.8188440554.21954540555.77756940560.670362-2.921115
" + ], + "text/plain": [ + "shape: (6, 6)\n", + "┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬───────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ new_a ┆ result │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪═══════════╡\n", + "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ -7.674924 │\n", + "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ -2.933522 │\n", + "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ -4.638827 │\n", + "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ -1.339664 │\n", + "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ -7.504234 │\n", + "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ -2.921115 │\n", + "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴───────────┘" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.apply(lambda x: x[\"a\"] - x[\"new_a\"].mean() * x[\"c\"] / x[\"b\"], axis=1)" + "# Don't do this (slow, row-wise)\n", + "mean_new_a = df.select(pl.col(\"new_a\").mean()).item()\n", + "df.with_columns(\n", + " result = pl.struct([\"a\", \"b\", \"c\"]).map_elements(\n", + " lambda x: x[\"a\"] - mean_new_a * x[\"c\"] / x[\"b\"],\n", + " return_dtype=pl.Float64\n", + " )\n", + ")\n", + "\n", + "# Do this instead (fast, vectorized)\n", + "df.with_columns(\n", + " result = pl.col(\"a\") - pl.col(\"new_a\").mean() * pl.col(\"c\") / pl.col(\"b\")\n", + ")" ] }, { @@ -612,24 +932,7 @@ "id": "78b558f4", "metadata": {}, "source": [ - "Note that this is just an example: you could still do this entire operation without using apply! 
But you will sometimes find yourself with cases where you do need to use it.\n", - "\n", - "Apply also works with functions, including user-defined functions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "136d435d", - "metadata": {}, - "outputs": [], - "source": [ - "def complicated_function(x):\n", - " return x - x.mean()\n", - "\n", - "\n", - "df = df.apply(complicated_function, axis=1)\n", - "df" + "The first expression would work, but it evaluates the computation row by row using a **python lambda**, which is slow and prevents **polars** from optimizing the query. The second approach uses native expressions, allowing **polars** to execute the computation efficiently in a fully vectorized and optimized manner." ] }, { @@ -637,19 +940,52 @@ "id": "171be2c9", "metadata": {}, "source": [ - "### Eval(uate)\n", - "\n", - "`eval()` evaluates a string describing operations on DataFrame columns to create new columns. It operates on columns only, not rows or elements. Here's an example:" + "In **polars**, there's no `eval()` — you use expressions directly instead:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "8d9defd3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 6)
abcdnew_aratio
f64f64f64f64f64f64
40554.72543640554.21243840556.42152940555.50293340559.7254360.999877
40555.32900740555.47093640553.54274640555.68244740560.3290070.999877
40554.42349740555.49267840554.36418940556.31110640559.4234970.999877
40555.51089740556.90672840553.56661640555.31013540560.5108970.999877
40555.48646340553.88755840556.68687640553.33151940560.4864630.999877
40555.67036240555.8188440554.21954540555.77756940560.6703620.999877
" + ], + "text/plain": [ + "shape: (6, 6)\n", + "┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ new_a ┆ ratio │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════╡\n", + "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ 0.999877 │\n", + "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ 0.999877 │\n", + "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ 0.999877 │\n", + "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ 0.999877 │\n", + "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ 0.999877 │\n", + "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ 0.999877 │\n", + "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────┘" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df[\"ratio\"] = df.eval(\"a / new_a\")\n", + "df = df.with_columns(\n", + " (pl.col(\"a\") / pl.col(\"new_a\")).alias(\"ratio\")\n", + ")\n", "df" ] }, @@ -658,7 +994,53 @@ "id": "8b275b5b", "metadata": {}, "source": [ - "Evaluate can also be used to create new boolean columns using, for example, a string `\"a > 0.5\"` in the above example." + "You can also create boolean columns the same way:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "f301c8cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 7)
abcdnew_aratioa_gt_0.5
f64f64f64f64f64f64bool
40554.72543640554.21243840556.42152940555.50293340559.7254360.999877true
40555.32900740555.47093640553.54274640555.68244740560.3290070.999877true
40554.42349740555.49267840554.36418940556.31110640559.4234970.999877true
40555.51089740556.90672840553.56661640555.31013540560.5108970.999877true
40555.48646340553.88755840556.68687640553.33151940560.4864630.999877true
40555.67036240555.8188440554.21954540555.77756940560.6703620.999877true
" + ], + "text/plain": [ + "shape: (6, 7)\n", + "┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────┬──────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ new_a ┆ ratio ┆ a_gt_0.5 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ bool │\n", + "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════╪══════════╡\n", + "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ 0.999877 ┆ true │\n", + "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ 0.999877 ┆ true │\n", + "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ 0.999877 ┆ true │\n", + "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ 0.999877 ┆ true │\n", + "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ 0.999877 ┆ true │\n", + "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ 0.999877 ┆ true │\n", + "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────┴──────────┘" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.with_columns(\n", + " (pl.col(\"a\") > 0.5).alias(\"a_gt_0.5\")\n", + ")\n", + "df" ] } ], From b040c93fc72fd3625939fabd502119f8cb05c762 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh Date: Tue, 28 Apr 2026 12:14:31 +0100 Subject: [PATCH 2/4] Merge branch 'main' of https://github.com/datathink/python4DSpolars into brent-programme --- visualise.quarto_ipynb_1 | 136 ---------------- workflow-help.quarto_ipynb_1 | 115 -------------- ...w-packages-and-environments.quarto_ipynb_1 | 149 ------------------ 3 files changed, 400 deletions(-) delete mode 100644 visualise.quarto_ipynb_1 delete mode 100644 workflow-help.quarto_ipynb_1 delete mode 100644 workflow-packages-and-environments.quarto_ipynb_1 diff --git a/visualise.quarto_ipynb_1 
b/visualise.quarto_ipynb_1 deleted file mode 100644 index 2ef6d2f..0000000 --- a/visualise.quarto_ipynb_1 +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualisation {#sec-visualise}\n", - "\n", - "After reading the first part of the book, you understand the basics of the most important tools for doing data science. Now it’s time to start diving into the details. In this part of the book, you’ll learn about visualising data in further depth (in @sec-vis-layers), and get further stuck into the details of the different kinds of data visualisation (in @sec-exploratory-data-analysis and @sec-communicate-plots). In this short chapter, we discuss the different ways to create visualisations, and the different purposes of visualisations.\n", - "\n", - "## Philosophies of data visualisation\n", - "\n", - "There are broadly two categories of approach to using code to create data visualisations: *imperative* (build what you want from individual elements) and *declarative* (say what you want from a list of pre-existing options). Choosing which to use involves a trade-off: imperative libraries offer you flexibility but at the cost of some verbosity; declarative libraries offer you a quick way to plot your data, but only if it’s in the right format to begin with, and customisation to special chart types is more difficult.\n", - "\n", - "Python has many excellent plotting packages, including perhaps the most powerful imperative plotting package around, **matplotlib**, and an amazing declarative library that we already saw, **lets-plot**. These two libraries will get you a long way, and each could be worthy of an entire book themselves. Fortunately for us, though, we can do 95% of what we need with a small number of commands from one or the other of them. 
In general, to keep this book as light as possible, we've opted to use **lets-plot** wherever possible—and @sec-vis-layers is going to take you on a more in-depth tour of how to use it yourself.\n", - "\n", - "## Purposes of data visualisation\n", - "\n", - "Data visualisation has all kinds of different purposes. It can be useful to bear in mind three broad categories of visualisation that are out there:\n", - "\n", - "- exploratory\n", - "- scientific\n", - "- narrative\n", - "\n", - "Let's look at each in a bit more detail.\n", - "\n", - "### Exploratory Data Viz\n", - "\n", - "The first of the three kinds is *exploratory data visualisation*, and it's the kind that you do when you're looking and data and trying to understand it. Just plotting the data is a really good strategy for getting a feel for any issues there might be. This is perhaps most famously demonstrated by Anscombe's quartet: four different datasets with the same mean, standard deviation, and correlation but very different data distributions." 
- ], - "id": "f3331573" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#| echo: false\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib_inline.backend_inline\n", - "\n", - "# Plot settings\n", - "plt.style.use(\"https://github.com/aeturrell/python4DS/raw/main/plot_style.txt\")\n", - "matplotlib_inline.backend_inline.set_matplotlib_formats(\"svg\")\n", - "\n", - "# Set max rows displayed for readability\n", - "pd.set_option(\"display.max_rows\", 6)\n", - "\n", - "x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", - "y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n", - "y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]\n", - "y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]\n", - "x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]\n", - "y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]\n", - "\n", - "datasets = {\"I\": (x, y1), \"II\": (x, y2), \"III\": (x, y3), \"IV\": (x4, y4)}\n", - "\n", - "fig, axs = plt.subplots(\n", - " 2,\n", - " 2,\n", - " sharex=True,\n", - " sharey=True,\n", - " figsize=(10, 6),\n", - " gridspec_kw={\"wspace\": 0.08, \"hspace\": 0.08},\n", - ")\n", - "axs[0, 0].set(xlim=(0, 20), ylim=(2, 14))\n", - "axs[0, 0].set(xticks=(0, 10, 20), yticks=(4, 8, 12))\n", - "\n", - "for ax, (label, (x, y)) in zip(axs.flat, datasets.items()):\n", - " ax.text(0.1, 0.9, label, fontsize=20, transform=ax.transAxes, va=\"top\")\n", - " ax.tick_params(direction=\"in\", top=True, right=True)\n", - " ax.plot(x, y, \"o\")\n", - "\n", - " # linear regression\n", - " p1, p0 = np.polyfit(x, y, deg=1) # slope, intercept\n", - " ax.axline(xy1=(0, p0), slope=p1, color=\"r\", lw=2)\n", - "\n", - " # add text box for the statistics\n", - " stats = (\n", - " f\"$\\\\mu$ = {np.mean(y):.2f}\\n\"\n", - " f\"$\\\\sigma$ = {np.std(y):.2f}\\n\"\n", - " f\"$r$ = {np.corrcoef(x, y)[0][1]:.2f}\"\n", - " )\n", - " bbox 
= dict(boxstyle=\"round\", fc=\"blanchedalmond\", ec=\"orange\", alpha=0.5)\n", - " ax.text(\n", - " 0.95,\n", - " 0.07,\n", - " stats,\n", - " fontsize=9,\n", - " bbox=bbox,\n", - " transform=ax.transAxes,\n", - " horizontalalignment=\"right\",\n", - " )\n", - "\n", - "plt.suptitle(\"Anscombe's Quartet\")\n", - "plt.show()" - ], - "id": "64a0e7f6", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Exploratory visualisation is usually quick and dirty, and flexible too. Some exploratory data viz can be automated, and there's a whole host of packages to help with this, including [**skimpy**](https://aeturrell.github.io/skimpy/).\n", - "\n", - "Beyond you and perhaps your co-authors/collaborators, however, not many other people should be seeing your exploratory visualisation! They will typically be worked up quickly, be numerous, and be throw-away. We'll look more at this in @sec-exploratory-data-analysis.\n", - "\n", - "### Scientific Data Viz\n", - "\n", - "The second kind, scientific data visualisation, is the prime cut of your exploratory visualisation. It's the kind of plot you might include in a more technical paper, the picture that says a thousand words. I often think of the first image of a black hole @akiyama2019first as a prime example of this. You can get away with having a high density of information in a scientific plot and, in short format journals, you may need to. The journal Physical Review Letters, which has an 8 page limit, has a classic of this genre in more or less every issue. Ensuring that important values can be accurately read from the plot is especially important in these kinds of charts. 
But they can also be the kind of plot that presents the killer results in a study; they might not be exciting to people who don't look at charts for a living, but they might be exciting and, just as importantly, understandable by your peers.\n", - "\n", - "This type of visualisation is especially popular in the big science journals like *Nature* and *Science*, where space is at a premium. We won't cover this type of plot in this book, because it tends to be very bespoke.\n", - "\n", - "### Narrative Data Viz\n", - "\n", - "The third and final kind is narrative data visualisation. This is the one that requires the most thought in the step where you go from the first view to the end product. It's a visualisation that doesn't just show a picture, but gives an insight. These are the kind of visualisations that you might see in the *Financial Times*, *The Economist*, or on the *BBC News* website. They come with aids that help the viewer focus on the aspects that the creator wanted them to (you can think of these aids or focuses as doing for visualisation what bold font does for text). They're well worth using in your work, especially if you're trying to communicate a particular narrative, and especially if the people you're communicating with don't have deep knowledge of the topic. You might use them in a paper that you hope will have a wide readership, in a blog post summarising your work, or in a report intended for a policymaker.\n", - "\n", - "You can find more information on the topic of communicating via data visualisations in the @sec-communicate-plots chapter." 
- ], - "id": "30b9ff30" - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3 (ipykernel)", - "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/workflow-help.quarto_ipynb_1 b/workflow-help.quarto_ipynb_1 deleted file mode 100644 index e7bebf8..0000000 --- a/workflow-help.quarto_ipynb_1 +++ /dev/null @@ -1,115 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Postscript: Getting Further Help {#sec-workflow-help}\n", - "\n", - "This book is not an island; there is no single resource that will allow you to master Python for Data Science. As you begin to apply the techniques described in this book to your own data, you will soon find questions that we do not answer. This section describes a few tips on how to get help, and to help you keep learning.\n", - "\n", - "## Resources\n", - "\n", - "Some other resources for learning are:\n", - "\n", - "- [The Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)\n", - "- [Real Python](https://realpython.com/), which has excellent short tutorials that cover Python more broadly (not just data science)\n", - "- [freeCodeCamp's Python courses](https://www.freecodecamp.org/news/search?query=data%20science%20python), though take care to select one that's at the right level for you\n", - "- [Coding for Economists](https://aeturrell.github.io/coding-for-economists), which has similar content to this book but is more in depth and aimed at analysts (particularly in economics)\n", - "\n", - "## Google is your friend\n", - "\n", - "If you get stuck, start with Google. Typically adding \"Python\" or \"Python Data Science\" (as the Python ecosystem goes *well* beyond data science) to a query is enough to restrict it to relevant results. 
Google is particularly useful for error messages. If you get an error message and you have no idea what it means, try googling it! Chances are that someone else has been confused by it in the past, and there will be help somewhere on the web.\n", - "\n", - "If Google doesn't help, try [Stack Overflow](http://stackoverflow.com). Start by spending a little time searching for an existing answer, including `[Python]` to restrict your search to questions and answers that use Python.\n", - "\n", - "## In the loop\n", - "\n", - "It's also helpful to keep an eye on the latest developments in data science. There are tons of data science newsletters out there, and we recommend keeping up with the Python data science community by following the (#pydata), (#datascience), and (#python) hashtags on Twitter.\n", - "\n", - "## Making a reprex (reproducible example)\n", - "\n", - "If your googling doesn't find anything useful, it's a really good idea prepare a minimal reproducible example or **reprex**.\n", - "\n", - "A good reprex makes it easier for other people to help you, and often you'll figure out the problem yourself in the course of making it. There are two parts to creating a reprex:\n", - "\n", - "- First, you need to make your code reproducible. This means that you need to capture everything, i.e., include any packages you used and create all necessary objects. The easiest way to make sure you've done this is to use the [**watermark**](https://github.com/rasbt/watermark) package alongside whatever else you are doing:" - ], - "id": "22b3f9e0" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import watermark.watermark as watermark\n", - "\n", - "print(watermark())\n", - "print(watermark(iversions=True, globals_=globals()))" - ], - "id": "a119501b", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Second, you need to make it minimal. 
Strip away everything that is not directly related to your problem. This usually involves creating a much smaller and simpler Python object than the one you're facing in real life or even using built-in data.\n", - "\n", - "That sounds like a lot of work! And it can be, but it has a great payoff:\n", - "\n", - "- 80% of the time creating an excellent reprex reveals the source of your problem. It's amazing how often the process of writing up a self-contained and minimal example allows you to answer your own question.\n", - "\n", - "- The other 20% of time you will have captured the essence of your problem in a way that is easy for others to play with. This substantially improves your chances of getting help.\n", - "\n", - "There are several things you need to include to make your example reproducible: Python environment, required packages, data, and code.\n", - "\n", - "- **Python environment**--really just the Python version. This is covered by the first call to the **watermark** package.\n", - "\n", - "- **Packages** and their versions. These should be loaded at the top of the script, so it's easy to see which ones the example needs. By using **watermark** with the above configuration, you will also print the package versions. This is a good time to check that you're using the latest version of each package; it's possible you've discovered a bug that's been fixed since you installed or last updated the package.\n", - "\n", - "- **Data**: as others won't be able to easily download the data you're working with, it's often best to create a small amount of data from code that still have the same problem as you're finding with your actual data. 
Between **numpy** and **pandas**, it's quite easy to generate data from code; here's an example:" - ], - "id": "c4ac60b4" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "df = pd.DataFrame(\n", - " data=np.reshape(range(36), (6, 6)),\n", - " index=[\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"],\n", - " columns=[\"col\" + str(i) for i in range(6)],\n", - " dtype=float,\n", - ")\n", - "df[\"random_normal\"] = np.random.normal(size=6)\n", - "df" - ], - "id": "d1e4562c", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- **Code**: copy and paste the minimal reproducible example code (including the packages, as noted above). Make sure you've used spaces and your variable names are concise, yet informative. Use comments to indicate where your problem lies. Do your best to remove everything that is not related to the problem. Finally, the shorter your code is, the easier it is to understand, and the easier it is to fix.\n", - "\n", - "Finish by checking that you have actually made a reproducible example by starting a fresh Python session and copying and pasting your reprex in." 
- ], - "id": "4b75e409" - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3 (ipykernel)", - "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/workflow-packages-and-environments.quarto_ipynb_1 b/workflow-packages-and-environments.quarto_ipynb_1 deleted file mode 100644 index a5600ce..0000000 --- a/workflow-packages-and-environments.quarto_ipynb_1 +++ /dev/null @@ -1,149 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Workflow: Packages and Environments {#sec-workflow-packages-and-environments}\n", - "\n", - "In this chapter, you're going to learn about packages and how to install them plus virtual coding environments that keep your packages isolated and your projects reproducible.\n", - "\n", - "## Packages\n", - "\n", - "### Introduction\n", - "\n", - "Packages (also called libraries) are key to extending the functionality of Python. It won't be long before you'll need to install some. There are packages for geoscience, for building websites, for analysing genetic data, for economics—pretty much for anything you can think of. Packages are typically not written by the core maintainers of the Python language but by enthusiasts, firms, researchers, academics, all sorts! Because anyone can write packages, they vary widely in their quality and usefulness. There are some that you'll be seeing them again and again.\n", - "\n", - "

Name a more iconic trio, I'll wait. pic.twitter.com/pGaLuUxQ3r

— Vicki Boykis (\\@vboykis) August 23, 2018
\n", - "\n", - "The three Python packages **numpy**, **pandas**, and **maplotlib**, which respectively provide numerical, data analysis, and plotting functionality, are ubiquitous. So many scripts begin by importing all three of them, as in the tweet above!\n", - "\n", - "There are typically two steps to using a new Python package:\n", - "\n", - "1. *install* the package on the command line (aka the terminal), eg using `uv add pandas`\n", - "\n", - "2. *import* the package into your Python session, eg using `import pandas as pd`\n", - "\n", - "When you issue an install command for a specific package, it is automatically downloaded from the internet and installed in the appropriate place on your computer. To install extra Python packages, you issue install commands to a text-based window called the \"terminal\".\n", - "\n", - "### The Command Line in Brief\n", - "\n", - "The *terminal* or *command line* or sometimes the *command prompt* was labelled 4 in the screenshot of Visual Studio Code from the chapter on @sec-introduction. The terminal is a text-based way to issue all kinds of commands to your computer (not just Python commands) and knowing a little bit about it is really useful for coding (and more) because managing packages, environments (which we haven't yet discussed), and version control (ditto) can all be done via the terminal. 
We'll come to these in due course in the chapter on @sec-command-line, but for now, a little background on what the terminal is and what it does.\n", - "\n", - "::: {.callout-note}\n", - "To open up the command line within Visual Studio Code, use the + \\` keyboard shortcut (Mac) or ctrl + \\` (Windows/Linux), or click \"View > Terminal\".\n", - "\n", - "If you want to open up the command line independently of Visual Studio Code, search for \"Terminal\" on Mac and Linux, and \"Powershell\" on Windows.\n", - ":::\n", - "\n", - "Firstly, everything you can do by clicking on icons to launch programmes on your computer, you can also do via the terminal, also known as the command line. For many programmes, a lot of their functionality can be accessed using the command line, and other programmes *only* have a command line interface (CLI), including some that are used for data science.\n", - "\n", - "::: {.callout-tip}\n", - "The command line interacts with your operating system and is used to create, activate, or change Python installations.\n", - ":::\n", - "\n", - "Use Visual Studio Code to open a terminal window by clicking Terminal -> New Terminal on the list of commands at the very top of the window. If you have installed uv on your computer, your terminal should look something like this as your 'command prompt':\n", - "\n", - "```bash\n", - "your-username@your-computer current-directory %\n", - "```\n", - "\n", - "on Mac, and the same but with '%' replaced by '$' on linux, and (using Powershell)\n", - "\n", - "```powershell\n", - "PS C:\\Windows\\System32>\n", - "```\n", - "\n", - "on Windows.\n", - "\n", - "You can check that uv has successfully installed Python in your current project's folder by running\n", - "\n", - "```bash\n", - "uv run python --version\n", - "```\n", - "\n", - "For now, to at least try out the command line, let's use something that works across all three of the major operating systems. 
Type `uv run python` on the command prompt that came up in your new terminal window. You should see information about your installation of Python appear, including the version, followed by a Python prompt that looks like `>>>`. This is a kind of interactive Python session, in the terminal. It's much less rich than the one available in Visual Studio Code (it can't run scripts line-by-line, for example) but you can try `print('Hello World!')` and it will run, printing your message. To exit the terminal-based Python session, type `exit()` to go back to the regular command line.\n", - "\n", - "### Installing Packages\n", - "\n", - "To install extra Python packages, the default and easiest way is to use `uv add **packagename**`. There are over 330,000 Python packages on PyPI (the Python Package Index)! You can see what packages you have installed already by running `uv pip list` into the command line.\n", - "\n", - "`uv add ...` will install packages into the special Python environment in your current folder (it sits in a subdirectory called \".venv\" which will be hidden by default on most systems.) It's really helpful and good practice to have one Python environment per project, and **uv** does this automatically for you.\n", - "\n", - "::: {.callout-tip title=\"Exercise\"}\n", - "Try installing the **matplotlib**, **pandas**, **statsmodels**, and **skimpy** packages using `uv add`.\n", - ":::\n", - "\n", - "### Using Packages\n", - "\n", - "Once you have installed a package, you need to be able to use it! This is usually done via an import statement at the top of your script or Jupyter Notebook. For example, to bring in **pandas**, it's\n", - "\n", - "```python\n", - "import pandas as pd\n", - "```\n", - "\n", - "Why does Python do this? The idea of not just loading every package is to provide clarity over what function is being called from what package. 
It's also not necessary to load every package for every piece of analysis, and you often actually want to know what the *minimum* set of packages is to reproduce an analysis. Making the package imports explicit helps with all of that.\n", - "\n", - "You may also wonder why one doesn't just use `import pandas as pandas`. There's actually nothing stopping you doing this except i) it's convenient to have a shorter name and ii) there does tend to be a convention around imports, ie `pd` for **pandas** and `np` for **numpy**, and your code will be clearer to yourself and others if you follow the conventions.\n", - "\n", - "## Virtual Code Environments\n", - "\n", - "Virtual code environments allow you to isolate all of the packages that you're using to do analysis for one project from the set of packages you might need for a different project. They're an important part of creating a reproducible analytical pipeline but a key benefit is that others can reproduce the environment you used and it's best practice to have an isolated environment per project.\n", - "\n", - "To be more concrete, let's say you're using Python 3.9, **statsmodels**, and **pandas** for one project, project A. And, for project B, you need to use Python 3.10 with **numpy** and **scikit-learn**. Even with the same version of Python, best practice would be to have two separate virtual Python environments: environment A, with everything needed for project A, and environment B, with everything needed for project B. For the case where you're using different versions of Python, this isn't just best practice, it's essential.\n", - "\n", - "Many programming languages now come with an option to install packages and a version of the language in isolated environments. In Python, there are multiple tools for managing different environments. 
And, of those, the easiest to work with is probably [**uv**](https://docs.astral.sh/uv/).\n", - "\n", - "You can see all of the packages in the environment created in your current folder by running `uv pip list` on the command line. Here's an example of looking at the installed packages within this very book, filtering them just to the ones beginning with \"s\".\n", - "\n", - "```{bash}\n", - "uv run pip list | grep ^s\n", - "```\n", - "\n", - "### The pyproject.toml file in Python Environments\n", - "\n", - "You may have noticed that a file called `pyproject.toml` has been created." - ], - "id": "8b889898" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "import toml\n", - "from rich import print_json\n", - "\n", - "print_json(data=toml.load(\"pyproject.toml\"))" - ], - "id": "688f09f1", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This lists all of the dependencies, and the version, of a **uv** Python project. There are lots of benefits to tracking what versions of packages you're using like this. One of the most important is that you can *share* projects with other people, and they can install them from these files too.\n", - "\n", - "As you install or remove packages, the `pyproject.toml` file changes in lockstep.\n", - "\n", - "Noe that Visual Studio Code shows which Python environment you are using when you open a Python script or Jupyter Notebook.\n", - "\n", - "![A typical user view in Visual Studio Code](https://github.com/aeturrell/coding-for-economists/blob/main/img/vscode_layout.png?raw=true)\n", - "\n", - "In the screenshot above, you can see the project-environment in two places: on the blue bar at the bottom of the screen, and (in 5), at the top right hand side of the interactive window. A similar top right indicator is present when you have a Jupyter Notebook open too." 
- ], - "id": "148595b3" - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3 (ipykernel)", - "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file From 619a58e8aea6e988babcf466e5f0acbd06c2dda9 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh Date: Tue, 28 Apr 2026 12:14:49 +0100 Subject: [PATCH 3/4] Refactor iteration.ipynb to enhance clarity by updating output displays, adjusting execution counts, and ensuring consistent references to polars throughout the notebook. --- visualise.quarto_ipynb_1 | 136 ++++++++++++++++ workflow-help.quarto_ipynb_1 | 115 ++++++++++++++ ...w-packages-and-environments.quarto_ipynb_1 | 149 ++++++++++++++++++ 3 files changed, 400 insertions(+) create mode 100644 visualise.quarto_ipynb_1 create mode 100644 workflow-help.quarto_ipynb_1 create mode 100644 workflow-packages-and-environments.quarto_ipynb_1 diff --git a/visualise.quarto_ipynb_1 b/visualise.quarto_ipynb_1 new file mode 100644 index 0000000..ceccefe --- /dev/null +++ b/visualise.quarto_ipynb_1 @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualisation {#sec-visualise}\n", + "\n", + "After reading the first part of the book, you understand the basics of the most important tools for doing data science. Now it’s time to start diving into the details. In this part of the book, you’ll learn about visualising data in further depth (in @sec-vis-layers), and get further stuck into the details of the different kinds of data visualisation (in @sec-exploratory-data-analysis and @sec-communicate-plots). 
In this short chapter, we discuss the different ways to create visualisations, and the different purposes of visualisations.\n", + "\n", + "## Philosophies of data visualisation\n", + "\n", + "There are broadly two categories of approach to using code to create data visualisations: *imperative* (build what you want from individual elements) and *declarative* (say what you want from a list of pre-existing options). Choosing which to use involves a trade-off: imperative libraries offer you flexibility but at the cost of some verbosity; declarative libraries offer you a quick way to plot your data, but only if it’s in the right format to begin with, and customisation to special chart types is more difficult.\n", + "\n", + "Python has many excellent plotting packages, including perhaps the most powerful imperative plotting package around, **matplotlib**, and an amazing declarative library that we already saw, **lets-plot**. These two libraries will get you a long way, and each could be worthy of an entire book themselves. Fortunately for us, though, we can do 95% of what we need with a small number of commands from one or the other of them. In general, to keep this book as light as possible, we've opted to use **lets-plot** wherever possible—and @sec-vis-layers is going to take you on a more in-depth tour of how to use it yourself.\n", + "\n", + "## Purposes of data visualisation\n", + "\n", + "Data visualisation has all kinds of different purposes. It can be useful to bear in mind three broad categories of visualisation that are out there:\n", + "\n", + "- exploratory\n", + "- scientific\n", + "- narrative\n", + "\n", + "Let's look at each in a bit more detail.\n", + "\n", + "### Exploratory Data Viz\n", + "\n", + "The first of the three kinds is *exploratory data visualisation*, and it's the kind that you do when you're looking at data and trying to understand it. Just plotting the data is a really good strategy for getting a feel for any issues there might be. 
This is perhaps most famously demonstrated by Anscombe's quartet: four different datasets with the same mean, standard deviation, and correlation but very different data distributions." + ], + "id": "e22704e5" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib_inline.backend_inline\n", + "\n", + "# Plot settings\n", + "plt.style.use(\"https://github.com/aeturrell/python4DS/raw/main/plot_style.txt\")\n", + "matplotlib_inline.backend_inline.set_matplotlib_formats(\"svg\")\n", + "\n", + "# Set max rows displayed for readability\n", + "pd.set_option(\"display.max_rows\", 6)\n", + "\n", + "x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", + "y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n", + "y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]\n", + "y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]\n", + "x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]\n", + "y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]\n", + "\n", + "datasets = {\"I\": (x, y1), \"II\": (x, y2), \"III\": (x, y3), \"IV\": (x4, y4)}\n", + "\n", + "fig, axs = plt.subplots(\n", + " 2,\n", + " 2,\n", + " sharex=True,\n", + " sharey=True,\n", + " figsize=(10, 6),\n", + " gridspec_kw={\"wspace\": 0.08, \"hspace\": 0.08},\n", + ")\n", + "axs[0, 0].set(xlim=(0, 20), ylim=(2, 14))\n", + "axs[0, 0].set(xticks=(0, 10, 20), yticks=(4, 8, 12))\n", + "\n", + "for ax, (label, (x, y)) in zip(axs.flat, datasets.items()):\n", + " ax.text(0.1, 0.9, label, fontsize=20, transform=ax.transAxes, va=\"top\")\n", + " ax.tick_params(direction=\"in\", top=True, right=True)\n", + " ax.plot(x, y, \"o\")\n", + "\n", + " # linear regression\n", + " p1, p0 = np.polyfit(x, y, deg=1) # slope, intercept\n", + " ax.axline(xy1=(0, p0), slope=p1, color=\"r\", lw=2)\n", + "\n", + " # add text box for the 
statistics\n", + " stats = (\n", + " f\"$\\\\mu$ = {np.mean(y):.2f}\\n\"\n", + " f\"$\\\\sigma$ = {np.std(y):.2f}\\n\"\n", + " f\"$r$ = {np.corrcoef(x, y)[0][1]:.2f}\"\n", + " )\n", + " bbox = dict(boxstyle=\"round\", fc=\"blanchedalmond\", ec=\"orange\", alpha=0.5)\n", + " ax.text(\n", + " 0.95,\n", + " 0.07,\n", + " stats,\n", + " fontsize=9,\n", + " bbox=bbox,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"right\",\n", + " )\n", + "\n", + "plt.suptitle(\"Anscombe's Quartet\")\n", + "plt.show()" + ], + "id": "87f336c9", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exploratory visualisation is usually quick and dirty, and flexible too. Some exploratory data viz can be automated, and there's a whole host of packages to help with this, including [**skimpy**](https://aeturrell.github.io/skimpy/).\n", + "\n", + "Beyond you and perhaps your co-authors/collaborators, however, not many other people should be seeing your exploratory visualisation! They will typically be worked up quickly, be numerous, and be throw-away. We'll look more at this in @sec-exploratory-data-analysis.\n", + "\n", + "### Scientific Data Viz\n", + "\n", + "The second kind, scientific data visualisation, is the prime cut of your exploratory visualisation. It's the kind of plot you might include in a more technical paper, the picture that says a thousand words. I often think of the first image of a black hole @akiyama2019first as a prime example of this. You can get away with having a high density of information in a scientific plot and, in short format journals, you may need to. The journal Physical Review Letters, which has an 8 page limit, has a classic of this genre in more or less every issue. Ensuring that important values can be accurately read from the plot is especially important in these kinds of charts. 
But they can also be the kind of plot that presents the killer results in a study; they might not be exciting to people who don't look at charts for a living, but they might be exciting and, just as importantly, understandable by your peers.\n", + "\n", + "This type of visualisation is especially popular in the big science journals like *Nature* and *Science*, where space is at a premium. We won't cover this type of plot in this book, because it tends to be very bespoke.\n", + "\n", + "### Narrative Data Viz\n", + "\n", + "The third and final kind is narrative data visualisation. This is the one that requires the most thought in the step where you go from the first view to the end product. It's a visualisation that doesn't just show a picture, but gives an insight. These are the kind of visualisations that you might see in the *Financial Times*, *The Economist*, or on the *BBC News* website. They come with aids that help the viewer focus on the aspects that the creator wanted them to (you can think of these aids or focuses as doing for visualisation what bold font does for text). They're well worth using in your work, especially if you're trying to communicate a particular narrative, and especially if the people you're communicating with don't have deep knowledge of the topic. You might use them in a paper that you hope will have a wide readership, in a blog post summarising your work, or in a report intended for a policymaker.\n", + "\n", + "You can find more information on the topic of communicating via data visualisations in the @sec-communicate-plots chapter." 
+ ], + "id": "4a968c3b" + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)", + "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/workflow-help.quarto_ipynb_1 b/workflow-help.quarto_ipynb_1 new file mode 100644 index 0000000..18428db --- /dev/null +++ b/workflow-help.quarto_ipynb_1 @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Postscript: Getting Further Help {#sec-workflow-help}\n", + "\n", + "This book is not an island; there is no single resource that will allow you to master Python for Data Science. As you begin to apply the techniques described in this book to your own data, you will soon find questions that we do not answer. This section describes a few tips on how to get help, and to help you keep learning.\n", + "\n", + "## Resources\n", + "\n", + "Some other resources for learning are:\n", + "\n", + "- [The Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)\n", + "- [Real Python](https://realpython.com/), which has excellent short tutorials that cover Python more broadly (not just data science)\n", + "- [freeCodeCamp's Python courses](https://www.freecodecamp.org/news/search?query=data%20science%20python), though take care to select one that's at the right level for you\n", + "- [Coding for Economists](https://aeturrell.github.io/coding-for-economists), which has similar content to this book but is more in depth and aimed at analysts (particularly in economics)\n", + "\n", + "## Google is your friend\n", + "\n", + "If you get stuck, start with Google. Typically adding \"Python\" or \"Python Data Science\" (as the Python ecosystem goes *well* beyond data science) to a query is enough to restrict it to relevant results. 
Google is particularly useful for error messages. If you get an error message and you have no idea what it means, try googling it! Chances are that someone else has been confused by it in the past, and there will be help somewhere on the web.\n", + "\n", + "If Google doesn't help, try [Stack Overflow](http://stackoverflow.com). Start by spending a little time searching for an existing answer, including `[Python]` to restrict your search to questions and answers that use Python.\n", + "\n", + "## In the loop\n", + "\n", + "It's also helpful to keep an eye on the latest developments in data science. There are tons of data science newsletters out there, and we recommend keeping up with the Python data science community by following the (#pydata), (#datascience), and (#python) hashtags on Twitter.\n", + "\n", + "## Making a reprex (reproducible example)\n", + "\n", + "If your googling doesn't find anything useful, it's a really good idea to prepare a minimal reproducible example or **reprex**.\n", + "\n", + "A good reprex makes it easier for other people to help you, and often you'll figure out the problem yourself in the course of making it. There are two parts to creating a reprex:\n", + "\n", + "- First, you need to make your code reproducible. This means that you need to capture everything, i.e., include any packages you used and create all necessary objects. The easiest way to make sure you've done this is to use the [**watermark**](https://github.com/rasbt/watermark) package alongside whatever else you are doing:" + ], + "id": "fa96eb6b" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import watermark.watermark as watermark\n", + "\n", + "print(watermark())\n", + "print(watermark(iversions=True, globals_=globals()))" + ], + "id": "4afe4b6e", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Second, you need to make it minimal. 
Strip away everything that is not directly related to your problem. This usually involves creating a much smaller and simpler Python object than the one you're facing in real life or even using built-in data.\n", + "\n", + "That sounds like a lot of work! And it can be, but it has a great payoff:\n", + "\n", + "- 80% of the time creating an excellent reprex reveals the source of your problem. It's amazing how often the process of writing up a self-contained and minimal example allows you to answer your own question.\n", + "\n", + "- The other 20% of the time you will have captured the essence of your problem in a way that is easy for others to play with. This substantially improves your chances of getting help.\n", + "\n", + "There are several things you need to include to make your example reproducible: Python environment, required packages, data, and code.\n", + "\n", + "- **Python environment**--really just the Python version. This is covered by the first call to the **watermark** package.\n", + "\n", + "- **Packages** and their versions. These should be loaded at the top of the script, so it's easy to see which ones the example needs. By using **watermark** with the above configuration, you will also print the package versions. This is a good time to check that you're using the latest version of each package; it's possible you've discovered a bug that's been fixed since you installed or last updated the package.\n", + "\n", + "- **Data**: as others won't be able to easily download the data you're working with, it's often best to create a small amount of data from code that still has the same problem as you're finding with your actual data. 
Between **numpy** and **pandas**, it's quite easy to generate data from code; here's an example:" + ], + "id": "defc6d2f" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "df = pd.DataFrame(\n", + " data=np.reshape(range(36), (6, 6)),\n", + " index=[\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"],\n", + " columns=[\"col\" + str(i) for i in range(6)],\n", + " dtype=float,\n", + ")\n", + "df[\"random_normal\"] = np.random.normal(size=6)\n", + "df" + ], + "id": "4d2de081", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **Code**: copy and paste the minimal reproducible example code (including the packages, as noted above). Make sure you've used spaces and your variable names are concise, yet informative. Use comments to indicate where your problem lies. Do your best to remove everything that is not related to the problem. Finally, the shorter your code is, the easier it is to understand, and the easier it is to fix.\n", + "\n", + "Finish by checking that you have actually made a reproducible example by starting a fresh Python session and copying and pasting your reprex in." 
+ ], + "id": "ad3edef5" + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)", + "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/workflow-packages-and-environments.quarto_ipynb_1 b/workflow-packages-and-environments.quarto_ipynb_1 new file mode 100644 index 0000000..ebd7b64 --- /dev/null +++ b/workflow-packages-and-environments.quarto_ipynb_1 @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workflow: Packages and Environments {#sec-workflow-packages-and-environments}\n", + "\n", + "In this chapter, you're going to learn about packages and how to install them plus virtual coding environments that keep your packages isolated and your projects reproducible.\n", + "\n", + "## Packages\n", + "\n", + "### Introduction\n", + "\n", + "Packages (also called libraries) are key to extending the functionality of Python. It won't be long before you'll need to install some. There are packages for geoscience, for building websites, for analysing genetic data, for economics—pretty much for anything you can think of. Packages are typically not written by the core maintainers of the Python language but by enthusiasts, firms, researchers, academics, all sorts! Because anyone can write packages, they vary widely in their quality and usefulness. There are some that you'll be seeing them again and again.\n", + "\n", + "

Name a more iconic trio, I'll wait. pic.twitter.com/pGaLuUxQ3r

— Vicki Boykis (\\@vboykis) August 23, 2018
\n", + "\n", + "The three Python packages **numpy**, **pandas**, and **maplotlib**, which respectively provide numerical, data analysis, and plotting functionality, are ubiquitous. So many scripts begin by importing all three of them, as in the tweet above!\n", + "\n", + "There are typically two steps to using a new Python package:\n", + "\n", + "1. *install* the package on the command line (aka the terminal), eg using `uv add pandas`\n", + "\n", + "2. *import* the package into your Python session, eg using `import pandas as pd`\n", + "\n", + "When you issue an install command for a specific package, it is automatically downloaded from the internet and installed in the appropriate place on your computer. To install extra Python packages, you issue install commands to a text-based window called the \"terminal\".\n", + "\n", + "### The Command Line in Brief\n", + "\n", + "The *terminal* or *command line* or sometimes the *command prompt* was labelled 4 in the screenshot of Visual Studio Code from the chapter on @sec-introduction. The terminal is a text-based way to issue all kinds of commands to your computer (not just Python commands) and knowing a little bit about it is really useful for coding (and more) because managing packages, environments (which we haven't yet discussed), and version control (ditto) can all be done via the terminal. 
We'll come to these in due course in the chapter on @sec-command-line, but for now, a little background on what the terminal is and what it does.\n", + "\n", + "::: {.callout-note}\n", + "To open up the command line within Visual Studio Code, use the ⌘ + \` keyboard shortcut (Mac) or ctrl + \` (Windows/Linux), or click \"View > Terminal\".\n", + "\n", + "If you want to open up the command line independently of Visual Studio Code, search for \"Terminal\" on Mac and Linux, and \"Powershell\" on Windows.\n", + ":::\n", + "\n", + "Firstly, everything you can do by clicking on icons to launch programmes on your computer, you can also do via the terminal, also known as the command line. For many programmes, a lot of their functionality can be accessed using the command line, and other programmes *only* have a command line interface (CLI), including some that are used for data science.\n", + "\n", + "::: {.callout-tip}\n", + "The command line interacts with your operating system and is used to create, activate, or change Python installations.\n", + ":::\n", + "\n", + "Use Visual Studio Code to open a terminal window by clicking Terminal -> New Terminal on the list of commands at the very top of the window. If you have installed uv on your computer, your terminal should look something like this as your 'command prompt':\n", + "\n", + "```bash\n", + "your-username@your-computer current-directory %\n", + "```\n", + "\n", + "on Mac, and the same but with '%' replaced by '$' on linux, and (using Powershell)\n", + "\n", + "```powershell\n", + "PS C:\\Windows\\System32>\n", + "```\n", + "\n", + "on Windows.\n", + "\n", + "You can check that uv has successfully installed Python in your current project's folder by running\n", + "\n", + "```bash\n", + "uv run python --version\n", + "```\n", + "\n", + "For now, to at least try out the command line, let's use something that works across all three of the major operating systems.
Type `uv run python` on the command prompt that came up in your new terminal window. You should see information about your installation of Python appear, including the version, followed by a Python prompt that looks like `>>>`. This is a kind of interactive Python session, in the terminal. It's much less rich than the one available in Visual Studio Code (it can't run scripts line-by-line, for example) but you can try `print('Hello World!')` and it will run, printing your message. To exit the terminal-based Python session, type `exit()` to go back to the regular command line.\n", + "\n", + "### Installing Packages\n", + "\n", + "To install extra Python packages, the default and easiest way is to use `uv add **packagename**`. There are over 330,000 Python packages on PyPI (the Python Package Index)! You can see what packages you have installed already by running `uv pip list` into the command line.\n", + "\n", + "`uv add ...` will install packages into the special Python environment in your current folder (it sits in a subdirectory called \".venv\" which will be hidden by default on most systems.) It's really helpful and good practice to have one Python environment per project, and **uv** does this automatically for you.\n", + "\n", + "::: {.callout-tip title=\"Exercise\"}\n", + "Try installing the **matplotlib**, **pandas**, **statsmodels**, and **skimpy** packages using `uv add`.\n", + ":::\n", + "\n", + "### Using Packages\n", + "\n", + "Once you have installed a package, you need to be able to use it! This is usually done via an import statement at the top of your script or Jupyter Notebook. For example, to bring in **pandas**, it's\n", + "\n", + "```python\n", + "import pandas as pd\n", + "```\n", + "\n", + "Why does Python do this? The idea of not just loading every package is to provide clarity over what function is being called from what package. 
It's also not necessary to load every package for every piece of analysis, and you often actually want to know what the *minimum* set of packages is to reproduce an analysis. Making the package imports explicit helps with all of that.\n", + "\n", + "You may also wonder why one doesn't just use `import pandas as pandas`. There's actually nothing stopping you doing this except i) it's convenient to have a shorter name and ii) there does tend to be a convention around imports, ie `pd` for **pandas** and `np` for **numpy**, and your code will be clearer to yourself and others if you follow the conventions.\n", + "\n", + "## Virtual Code Environments\n", + "\n", + "Virtual code environments allow you to isolate all of the packages that you're using to do analysis for one project from the set of packages you might need for a different project. They're an important part of creating a reproducible analytical pipeline but a key benefit is that others can reproduce the environment you used and it's best practice to have an isolated environment per project.\n", + "\n", + "To be more concrete, let's say you're using Python 3.9, **statsmodels**, and **pandas** for one project, project A. And, for project B, you need to use Python 3.10 with **numpy** and **scikit-learn**. Even with the same version of Python, best practice would be to have two separate virtual Python environments: environment A, with everything needed for project A, and environment B, with everything needed for project B. For the case where you're using different versions of Python, this isn't just best practice, it's essential.\n", + "\n", + "Many programming languages now come with an option to install packages and a version of the language in isolated environments. In Python, there are multiple tools for managing different environments. 
And, of those, the easiest to work with is probably [**uv**](https://docs.astral.sh/uv/).\n", + "\n", + "You can see all of the packages in the environment created in your current folder by running `uv pip list` on the command line. Here's an example of looking at the installed packages within this very book, filtering them just to the ones beginning with \"s\".\n", + "\n", + "```{bash}\n", + "uv pip list | grep ^s\n", + "```\n", + "\n", + "### The pyproject.toml file in Python Environments\n", + "\n", + "You may have noticed that a file called `pyproject.toml` has been created." + ], + "id": "8b429c82" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import toml\n", + "from rich import print_json\n", + "\n", + "print_json(data=toml.load(\"pyproject.toml\"))" + ], + "id": "a099d93a", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This lists all of the dependencies, and the version, of a **uv** Python project. There are lots of benefits to tracking what versions of packages you're using like this. One of the most important is that you can *share* projects with other people, and they can install them from these files too.\n", + "\n", + "As you install or remove packages, the `pyproject.toml` file changes in lockstep.\n", + "\n", + "Note that Visual Studio Code shows which Python environment you are using when you open a Python script or Jupyter Notebook.\n", + "\n", + "![A typical user view in Visual Studio Code](https://github.com/aeturrell/coding-for-economists/blob/main/img/vscode_layout.png?raw=true)\n", + "\n", + "In the screenshot above, you can see the project-environment in two places: on the blue bar at the bottom of the screen, and (in 5), at the top right hand side of the interactive window. A similar top right indicator is present when you have a Jupyter Notebook open too.
+ ], + "id": "08e648b4" + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)", + "path": "/Users/omagic/Documents/GitHub/python4DSpolars/.venv/share/jupyter/kernels/python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From dde11c9f1270fe976df291316261b3cb6d9a6c82 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh Date: Tue, 28 Apr 2026 12:34:24 +0100 Subject: [PATCH 4/4] Update command-line.md for consistent formatting and clarity by replacing asterisks with underscores for emphasis. Adjust iteration.ipynb to correct execution counts and enhance output displays. Modify visualisation.quarto_ipynb_1 and workflow-help.quarto_ipynb_1 to update cell IDs for better tracking. Refactor workflow-packages-and-environments.quarto_ipynb_1 to ensure consistent ID formatting across cells. --- command-line.md | 104 +++++++------- iteration.ipynb | 135 +++++++++--------- visualise.quarto_ipynb_1 | 6 +- workflow-help.quarto_ipynb_1 | 10 +- ...w-packages-and-environments.quarto_ipynb_1 | 6 +- 5 files changed, 127 insertions(+), 134 deletions(-) diff --git a/command-line.md b/command-line.md index 1079842..2cdce9b 100644 --- a/command-line.md +++ b/command-line.md @@ -1,6 +1,6 @@ # The Command Line {#sec-command-line} -In this chapter, you'll meet the *command line* and learn how to use it. Beyond a few key commands like `uv add ` you don't strictly need to know how to use the command line to follow the rest of this book. However, even a tiny bit of knowledge of the command line goes a long way in coding and will serve you well. +In this chapter, you'll meet the _command line_ and learn how to use it. Beyond a few key commands like `uv add ` you don't strictly need to know how to use the command line to follow the rest of this book. However, even a tiny bit of knowledge of the command line goes a long way in coding and will serve you well. 
To try out any of the commands in this chapter on your machine, you can select 'New Terminal' from the menu bar in Visual Studio Code (Mac and Linux), use the Windows Subsystem for Linux or git bash (Windows), or use a free [online terminal](https://cocalc.com/doc/terminal.html). @@ -8,17 +8,17 @@ This chapter has benefited from numerous sources, including absolutely excellent ## What is the command line? -The command line is a way to directly issue text-based commands to a computer one line at a time (as distinct from a graphical user interface, or GUI, that you navigate with a mouse). It goes under many names: shell, bash, terminal, CLI, and command line. These are actually different things but most people tend to use them to mean the same thing most of the time. The *shell* is the part of an operating system that you interact with but mostly people use shell to mean the command line. *bash* is the programming language that is used in the command line; it's actually a synonym for 'Born Again SHell'. The *terminal* is sometimes used to refer to the command line on Macs. Finally, a *CLI* is just an acronym for command line interface, and is often used in the context of an application; for example, uv has a command line interface because you run it on the command line to install packages (`uv add packagename`). +The command line is a way to directly issue text-based commands to a computer one line at a time (as distinct from a graphical user interface, or GUI, that you navigate with a mouse). It goes under many names: shell, bash, terminal, CLI, and command line. These are actually different things but most people tend to use them to mean the same thing most of the time. The _shell_ is the part of an operating system that you interact with but mostly people use shell to mean the command line. _bash_ is the programming language that is used in the command line; it's actually a synonym for 'Born Again SHell'. 
The _terminal_ is sometimes used to refer to the command line on Macs. Finally, a _CLI_ is just an acronym for command line interface, and is often used in the context of an application; for example, uv has a command line interface because you run it on the command line to install packages (`uv add packagename`). It's worth mentioning that there's a big difference between the command line on UNIX based systems (MacOS and Linux), and on Windows systems. Here, we'll only address the UNIX version. There is a command line on Windows but it's not widely used for coding. If you're on a Windows machine, you can access a UNIX command line using the Windows Subsystem for Linux. ## Why is the command line useful? -The command line has many uses. Graphical user interfaces are, generally, a bit easier to use *but* they're not very repeatable or scalable. Because the command line uses text-based instructions and can be programmed, it is both repeatable and scalable; properties that are very useful for research and analysis. +The command line has many uses. Graphical user interfaces are, generally, a bit easier to use _but_ they're not very repeatable or scalable. Because the command line uses text-based instructions and can be programmed, it is both repeatable and scalable; properties that are very useful for research and analysis. The broad reasons you might use the command line to issue instructions include: -- software functionality: some software *only* has a command line interface +- software functionality: some software _only_ has a command line interface - efficiency: your computer has limited memory, which graphical user interfaces use a lot of—the command line uses less @@ -71,7 +71,7 @@ The flags or options, such as `-n` in the example above, typically begin with a Spaces take on a special role when using the command line. For this reason, it's good practice to avoid spaces in file names. 
If you need to refer to a filename with spaces in, you’ll need to use quotes or escape the spaces in the file names using a `\`, for example `this is my file.txt` becomes `this\ is\ my\ file.txt` ::: -To run programmes from the command line, all you need is the name of the programme as the command: in fact, commands *are* programmes. The `date` command refers to an actual programme on your computer that you can find. And this also explains a bit of what's going on when you *run a script from the command line* (more on that later). +To run programmes from the command line, all you need is the name of the programme as the command: in fact, commands _are_ programmes. The `date` command refers to an actual programme on your computer that you can find. And this also explains a bit of what's going on when you _run a script from the command line_ (more on that later). Once you've run a few commands, you'll notice that you can't navigate around the command line like you can a text file or Python script. Here are some tips for navigating the command line: @@ -93,20 +93,20 @@ Once you've run a few commands, you'll notice that you can't navigate around the ### Navigating directories -While we're on navigating, it's useful to understand *where* in the computer you are when you open the command line. If you open a terminal pane within VS Code, you will start (by default at least) within the same folder as your project. Starting a terminal instance outside of VS Code will get you a terminal in a root directory for your computer; for example, on a Mac, opening a new terminal window starts you in `/Users/yourusername/`. +While we're on navigating, it's useful to understand _where_ in the computer you are when you open the command line. If you open a terminal pane within VS Code, you will start (by default at least) within the same folder as your project. 
Starting a terminal instance outside of VS Code will get you a terminal in a root directory for your computer; for example, on a Mac, opening a new terminal window starts you in `/Users/yourusername/`. To find out "where" you are when you open a terminal, you can use the `pwd` command, which stands for "print working directory". -The table below shows some useful commands for moving around your computer using the command line. Note that `cd` accepts a location *relative* to your current directory. +The table below shows some useful commands for moving around your computer using the command line. Note that `cd` accepts a location _relative_ to your current directory. - | Command | What it does | - | --------------------- | ------------------------------------------------------------ | - | `pwd` | Shows current directory | - | `cd` | Change directory command | - | `cd ..` | Go up one level in the directory (`cd ../..` for two levels) | - | `cd ~` | Go to your home directory | - | `cd -` | Go to the previous directory | - | `cd documents/papers` | Go directly to a directory named 'papers' | +| Command | What it does | +| --------------------- | ------------------------------------------------------------ | +| `pwd` | Shows current directory | +| `cd` | Change directory command | +| `cd ..` | Go up one level in the directory (`cd ../..` for two levels) | +| `cd ~` | Go to your home directory | +| `cd -` | Go to the previous directory | +| `cd documents/papers` | Go directly to a directory named 'papers' | ## Using Python on the command line @@ -124,7 +124,7 @@ Say you have a script called `analysis.py`, you can run it with Python on the co uv run python analysis.py ``` -which calls Python as a programme and gives it `analysis.py` as the argument. 
If you have multiple versions of Python, which you should do if you're following best practice and using a version per project, then you can see *which* version of Python is being used with +which calls Python as a programme and gives it `analysis.py` as the argument. If you have multiple versions of Python, which you should do if you're following best practice and using a version per project, then you can see _which_ version of Python is being used with ```bash which python @@ -134,39 +134,39 @@ which python Now we'll see some useful commands for the terminal. - | Command                                          | What it does | - | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | - | `man ` | Shows a manual for the given command | - | `touch ` | Creates an empty file named `` | - | `code ` | Open a file in VS Code (creating it, if it does not exist) | - | `mkdir ` | creates a new folder called `foldername` | - | `echo ` | Prints `` | - | `cat ` | Print the full contents of `` | - | `head ` | Print the start of a file | - | `tail ` | Print the end of a file | - | `> ` | Redirects output from screen to ``. For example, `echo "Hello World" > hello.txt` | - | `>> ` | Redirects output from screen to the end of ``, ie appends output rather than overwrites it | - | ` | ` | The pipe symbol: uses output from one command as input into another. 
For example, `head -n 10 data.csv | > hello_world.txt` would write the first 10 lines of data.csv into a file called hello_world.txt | - | `less ` | Print out the contents of a file in paginated form. Use `ctrl+v` and `Alt+v` (or `⌘+v` and `⌥+v` on Mac) to move up and down. Press `q` to quit. | - | `wc -l` | Returns number of lines in input, for example `cat | wc -l`. Use `wc` alone for word count. | - | `sort` | Arrange lines in a file in alphabetical order | - | `uniq` | Remove duplicate lines from input, for example `cat | uniq` or `uniq -d` to show duplicate files | - | `mv` | Move or rename a file; for example, `mv file1 file2` would rename `file1` to `file2` while `mv file1 ~` would move `file1` to the home directory | - | `cp` | Copy a file; for example, `cp file1 file2` would copy `file1` to `file2` while `cp file1 ~` would make a copy of `file1` in the home directory | - | `rm ` | Permanently remove a file | - | `rmdir ` | Permanently remove an empty directory | - | `rm -rf ` | ⚠ Permanently remove everything in a directory ⚠ | - | `grep ` | Search for a given term, for example `cat hello_world.txt | grep world` | - | `ls` | Basically, this means list stuff (files and folders) in the current directory | - | `ls -a` | List stuff in the current directory even if it's hidden | - | `ls -l` | List stuff in a more readable format and show permissions | - | `ls -S` | List stuff by size | - | `file ` | Give information on the file type of `` | - | `find` | Find specific files on your computer, can be piped into other commands for example `find *.md -size +5k -type f | xargs wc -l` will count the number of lines `wc -l` of all files, `-type f`, ending in `.md` that are greater than 5 kilobytes in size, `-size +5k`. | - | `diff -u ` | Show a single summary of the differences between two files. 
| +| Command                                          | What it does | +| --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | +| `man ` | Shows a manual for the given command | +| `touch ` | Creates an empty file named `` | +| `code ` | Open a file in VS Code (creating it, if it does not exist) | +| `mkdir ` | creates a new folder called `foldername` | +| `echo ` | Prints `` | +| `cat ` | Print the full contents of `` | +| `head ` | Print the start of a file | +| `tail ` | Print the end of a file | +| `> ` | Redirects output from screen to ``. For example, `echo "Hello World" > hello.txt` | +| `>> ` | Redirects output from screen to the end of ``, ie appends output rather than overwrites it | +| ` | ` | The pipe symbol: uses output from one command as input into another. For example, `head -n 10 data.csv | > hello_world.txt` would write the first 10 lines of data.csv into a file called hello_world.txt | +| `less ` | Print out the contents of a file in paginated form. Use `ctrl+v` and `Alt+v` (or `⌘+v` and `⌥+v` on Mac) to move up and down. Press `q` to quit. | +| `wc -l` | Returns number of lines in input, for example `cat | wc -l`. Use `wc` alone for word count. 
| +| `sort` | Arrange lines in a file in alphabetical order | +| `uniq` | Remove duplicate lines from input, for example `cat | uniq`or`uniq -d` to show duplicate files | +| `mv` | Move or rename a file; for example, `mv file1 file2` would rename `file1` to `file2` while `mv file1 ~` would move `file1` to the home directory | +| `cp` | Copy a file; for example, `cp file1 file2` would copy `file1` to `file2` while `cp file1 ~` would make a copy of `file1` in the home directory | +| `rm ` | Permanently remove a file | +| `rmdir ` | Permanently remove an empty directory | +| `rm -rf ` | ⚠ Permanently remove everything in a directory ⚠ | +| `grep ` | Search for a given term, for example `cat hello_world.txt | grep world` | +| `ls` | Basically, this means list stuff (files and folders) in the current directory | +| `ls -a` | List stuff in the current directory even if it's hidden | +| `ls -l` | List stuff in a more readable format and show permissions | +| `ls -S` | List stuff by size | +| `file ` | Give information on the file type of `` | +| `find` | Find specific files on your computer, can be piped into other commands for example `find \*.md -size +5k -type f | xargs wc -l`will count the number of lines`wc -l`of all files,`-type f`, ending in `.md`that are greater than 5 kilobytes in size,`-size +5k`. | +| `diff -u ` | Show a single summary of the differences between two files. | ![More details of the grep command](https://pbs.twimg.com/media/DcPeD_CW0AEkSar?format=jpg&name=small) -*More details of the grep command, by [\@b0rk](https://twitter.com/b0rk).* +_More details of the grep command, by [\@b0rk](https://twitter.com/b0rk)._ You can write for loops in bash (remember, it's a language). The general structure is @@ -187,7 +187,7 @@ A more interesting example is giving the number of lines of text, number of word ```bash for i in $(ls *.csv) -do +do wc $i done ``` @@ -204,7 +204,7 @@ done A couple of new features appeared in the examples above. 
-`*` is a *wildcard character*, it tells bash to look for anything that ends in ".csv". This is not the only special case; `?` serves a similar purpose of standing in for any character but just *one* character rather than arbitrarily many. If you had a folder with `file1.csv`, `file2.csv`, etc., up to 9, then you could use `file?.csv` to refer to all of them but this would not pick up `file10.csv`. +`*` is a _wildcard character_, it tells bash to look for anything that ends in ".csv". This is not the only special case; `?` serves a similar purpose of standing in for any character but just _one_ character rather than arbitrarily many. If you had a folder with `file1.csv`, `file2.csv`, etc., up to 9, then you could use `file?.csv` to refer to all of them but this would not pick up `file10.csv`. Another special character we've already seen is the curly brace, `{}`. Whenever you have a common substring in a series of commands using curly braces tells the command line to expand what's in them automatically. In an example above, this is used on 1 to 5. But it can also be used in, for example, file names: @@ -272,7 +272,7 @@ You can find more of these special variables [here](https://tldp.org/LDP/abs/htm [**pandoc**](https://pandoc.org/) is absolutely brilliant: if you need to convert files containing text from one format to another, it really is a swiss-army knife. There isn't space here to list the ridiculous number of documents it can convert between, but, importantly, it can translate back and forth between all of the following: markdown, $\LaTeX$, Microsoft Word's docx, OpenOffice's ODT, HTML, and Jupyter Notebook. -It can also write from any of those formats (and more) in one direction *to* PDF, Microsoft Powerpoint, and $\LaTeX$ Beamer. +It can also write from any of those formats (and more) in one direction _to_ PDF, Microsoft Powerpoint, and $\LaTeX$ Beamer. 
To use **pandoc**, install it following the instructions on the website and then call it like this: @@ -284,9 +284,9 @@ This is an example where the input is a .tex document and the output, `-o`, is a You can get quite fancy with **pandoc**, for example you can translate a whole book's worth of latex into a Word doc complete with a Word style, a bibliography via biblatex, equations, and figures. Nothing can save Word from being painful to use, but **pandoc** certainly helps. -[**eza**](https://eza.rocks/) is an upgrade on the `ls` command. It is designed to be an improved file lister with more features and better defaults. It uses colours to distinguish file types and metadata. Follow the instructions on the website to install it on your operating system. To replace `ls` with `eza`, you can use a terminal *alias*. There's a good guide [available here](https://denisrasulev.medium.com/eza-the-best-ls-command-replacement-9621252323e). +[**eza**](https://eza.rocks/) is an upgrade on the `ls` command. It is designed to be an improved file lister with more features and better defaults. It uses colours to distinguish file types and metadata. Follow the instructions on the website to install it on your operating system. To replace `ls` with `eza`, you can use a terminal _alias_. There's a good guide [available here](https://denisrasulev.medium.com/eza-the-best-ls-command-replacement-9621252323e). -**nano** is a built-in text editor that runs *within* the terminal. This can be really useful if you're working on the cloud (but it's not got the rich features of a GUI-based text editor like VS Code). To open a file using **nano**, the command is `nano file.txt`. Nano displays instructions on how to navigate when it loads up but exiting is the hardest part: when you're done, hit `Ctrl+X`, then `y` to save, and then `enter` to exit. +**nano** is a built-in text editor that runs _within_ the terminal. 
This can be really useful if you're working on the cloud (but it's not got the rich features of a GUI-based text editor like VS Code). To open a file using **nano**, the command is `nano file.txt`. Nano displays instructions on how to navigate when it loads up but exiting is the hardest part: when you're done, hit `Ctrl+X`, then `y` to save, and then `enter` to exit. [**wget**](https://www.gnu.org/software/wget/) is a command-line utility for downloading files from the internet. It's very simple to use, the syntax is just `wget [options] [url]`. For example, to download the starwars csv file used in this book, the command is diff --git a/iteration.ipynb b/iteration.ipynb index 5975665..3db7c82 100644 --- a/iteration.ipynb +++ b/iteration.ipynb @@ -348,7 +348,7 @@ "[51, 52, 53, 54, 55, 56, 57, 58, 59, 60]" ] }, - "execution_count": 32, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -536,7 +536,7 @@ "{'Ada': 'Lovelace', 'Adam': 'Smith'}" ] }, - "execution_count": 38, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -651,7 +651,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 4)
abcd
f64f64f64f64
-0.274564-0.7875621.4215290.502933
0.3290070.470936-1.4572540.682447
-0.5765030.492678-0.6358111.311106
0.5108971.906728-1.4333840.310135
0.486463-1.1124421.686876-1.668481
0.6703620.81884-0.7804550.777569
" + "shape: (6, 4)
abcd
f64f64f64f64
-0.1032560.546434-1.0070280.016913
-0.377048-0.2567130.913368-1.584054
-0.0096441.126097-0.4014050.332419
1.551129-0.505919-0.2170860.348191
0.7055770.826090.4458931.21817
0.3426031.178953-0.0046820.546398
" ], "text/plain": [ "shape: (6, 4)\n", @@ -660,23 +660,23 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ -0.274564 ┆ -0.787562 ┆ 1.421529 ┆ 0.502933 │\n", - "│ 0.329007 ┆ 0.470936 ┆ -1.457254 ┆ 0.682447 │\n", - "│ -0.576503 ┆ 0.492678 ┆ -0.635811 ┆ 1.311106 │\n", - "│ 0.510897 ┆ 1.906728 ┆ -1.433384 ┆ 0.310135 │\n", - "│ 0.486463 ┆ -1.112442 ┆ 1.686876 ┆ -1.668481 │\n", - "│ 0.670362 ┆ 0.81884 ┆ -0.780455 ┆ 0.777569 │\n", + "│ -0.103256 ┆ 0.546434 ┆ -1.007028 ┆ 0.016913 │\n", + "│ -0.377048 ┆ -0.256713 ┆ 0.913368 ┆ -1.584054 │\n", + "│ -0.009644 ┆ 1.126097 ┆ -0.401405 ┆ 0.332419 │\n", + "│ 1.551129 ┆ -0.505919 ┆ -0.217086 ┆ 0.348191 │\n", + "│ 0.705577 ┆ 0.82609 ┆ 0.445893 ┆ 1.21817 │\n", + "│ 0.342603 ┆ 1.178953 ┆ -0.004682 ┆ 0.546398 │\n", "└───────────┴───────────┴───────────┴───────────┘" ] }, - "execution_count": 40, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import polars as pl\n", "import numpy as np\n", + "import polars as pl\n", "\n", "df = pl.DataFrame(np.random.normal(size=(6, 4)), schema=[\"a\", \"b\", \"c\", \"d\"])\n", "df" @@ -706,20 +706,20 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
abcd
f64f64f64f64
0.4077350.481807-0.7081330.59269
" + "shape: (1, 4)
abcd
f64f64f64f64
0.1664790.686262-0.1108840.340305
" ], "text/plain": [ "shape: (1, 4)\n", - "┌──────────┬──────────┬───────────┬─────────┐\n", - "│ a ┆ b ┆ c ┆ d │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════╪══════════╪═══════════╪═════════╡\n", - "│ 0.407735 ┆ 0.481807 ┆ -0.708133 ┆ 0.59269 │\n", - "└──────────┴──────────┴───────────┴─────────┘" + "┌──────────┬──────────┬───────────┬──────────┐\n", + "│ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════╪══════════╪═══════════╪══════════╡\n", + "│ 0.166479 ┆ 0.686262 ┆ -0.110884 ┆ 0.340305 │\n", + "└──────────┴──────────┴───────────┴──────────┘" ] }, - "execution_count": 41, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -744,7 +744,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 1)
row_median
f64
0.114185
0.399972
-0.041912
0.410516
-0.31299
0.723965
" + "shape: (6, 1)
row_median
f64
-0.043172
-0.316881
0.161388
0.065552
0.765833
0.4445
" ], "text/plain": [ "shape: (6, 1)\n", @@ -753,24 +753,22 @@ "│ --- │\n", "│ f64 │\n", "╞════════════╡\n", - "│ 0.114185 │\n", - "│ 0.399972 │\n", - "│ -0.041912 │\n", - "│ 0.410516 │\n", - "│ -0.31299 │\n", - "│ 0.723965 │\n", + "│ -0.043172 │\n", + "│ -0.316881 │\n", + "│ 0.161388 │\n", + "│ 0.065552 │\n", + "│ 0.765833 │\n", + "│ 0.4445 │\n", "└────────────┘" ] }, - "execution_count": 42, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.select(\n", - " pl.concat_list(pl.all()).list.median().alias(\"row_median\")\n", - ")" + "df.select(pl.concat_list(pl.all()).list.median().alias(\"row_median\"))" ] }, { @@ -791,18 +789,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "247 μs ± 1.65 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "246 μs ± 2.46 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "# Do not do this!\n", "\n", + "\n", "def add_five_slow(df):\n", " for i in range(len(df)):\n", " for j in range(len(df.columns)):\n", " df[i, j] = df[i, j] + 5\n", "\n", + "\n", "%timeit add_five_slow(df)" ] }, @@ -824,7 +824,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "52.8 μs ± 1.79 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + "51.1 μs ± 395 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], @@ -857,7 +857,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = df.with_columns(new_a = pl.col(\"a\") + 5)" + "df = df.with_columns(new_a=pl.col(\"a\") + 5)" ] }, { @@ -888,7 +888,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 6)
abcdnew_aresult
f64f64f64f64f64f64
40554.72543640554.21243840556.42152940555.50293340559.725436-7.674924
40555.32900740555.47093640553.54274640555.68244740560.329007-2.933522
40554.42349740555.49267840554.36418940556.31110640559.423497-4.638827
40555.51089740556.90672840553.56661640555.31013540560.510897-1.339664
40555.48646340553.88755840556.68687640553.33151940560.486463-7.504234
40555.67036240555.8188440554.21954540555.77756940560.670362-2.921115
" + "shape: (6, 6)
abcdnew_aresult
f64f64f64f64f64f64
40554.89674440555.54643440553.99297240555.01691340559.896744-3.90117
40554.62295240554.74328740555.91336840553.41594640559.622952-6.898851
40554.99035640556.12609740554.59859540555.33241940559.990356-3.833543
40556.55112940554.49408140554.78291440555.34819140561.551129-4.089305
40555.70557740555.8260940555.44589340556.2181740560.705577-4.265744
40555.34260340556.17895340554.99531840555.54639840560.342603-3.8252
" ], "text/plain": [ "shape: (6, 6)\n", @@ -897,16 +897,16 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪═══════════╡\n", - "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ -7.674924 │\n", - "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ -2.933522 │\n", - "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ -4.638827 │\n", - "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ -1.339664 │\n", - "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ -7.504234 │\n", - "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ -2.921115 │\n", + "│ 40554.896744 ┆ 40555.546434 ┆ 40553.992972 ┆ 40555.016913 ┆ 40559.896744 ┆ -3.90117 │\n", + "│ 40554.622952 ┆ 40554.743287 ┆ 40555.913368 ┆ 40553.415946 ┆ 40559.622952 ┆ -6.898851 │\n", + "│ 40554.990356 ┆ 40556.126097 ┆ 40554.598595 ┆ 40555.332419 ┆ 40559.990356 ┆ -3.833543 │\n", + "│ 40556.551129 ┆ 40554.494081 ┆ 40554.782914 ┆ 40555.348191 ┆ 40561.551129 ┆ -4.089305 │\n", + "│ 40555.705577 ┆ 40555.82609 ┆ 40555.445893 ┆ 40556.21817 ┆ 40560.705577 ┆ -4.265744 │\n", + "│ 40555.342603 ┆ 40556.178953 ┆ 40554.995318 ┆ 40555.546398 ┆ 40560.342603 ┆ -3.8252 │\n", "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴───────────┘" ] }, - "execution_count": 46, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -915,16 +915,13 @@ "# Don't do this (slow, row-wise)\n", "mean_new_a = df.select(pl.col(\"new_a\").mean()).item()\n", "df.with_columns(\n", - " result = pl.struct([\"a\", \"b\", \"c\"]).map_elements(\n", - " lambda x: x[\"a\"] - mean_new_a * x[\"c\"] / x[\"b\"],\n", - " return_dtype=pl.Float64\n", + " result=pl.struct([\"a\", \"b\", \"c\"]).map_elements(\n", + " lambda x: x[\"a\"] - mean_new_a * x[\"c\"] / 
x[\"b\"], return_dtype=pl.Float64\n", " )\n", ")\n", "\n", "# Do this instead (fast, vectorized)\n", - "df.with_columns(\n", - " result = pl.col(\"a\") - pl.col(\"new_a\").mean() * pl.col(\"c\") / pl.col(\"b\")\n", - ")" + "df.with_columns(result=pl.col(\"a\") - pl.col(\"new_a\").mean() * pl.col(\"c\") / pl.col(\"b\"))" ] }, { @@ -959,7 +956,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 6)
abcdnew_aratio
f64f64f64f64f64f64
40554.72543640554.21243840556.42152940555.50293340559.7254360.999877
40555.32900740555.47093640553.54274640555.68244740560.3290070.999877
40554.42349740555.49267840554.36418940556.31110640559.4234970.999877
40555.51089740556.90672840553.56661640555.31013540560.5108970.999877
40555.48646340553.88755840556.68687640553.33151940560.4864630.999877
40555.67036240555.8188440554.21954540555.77756940560.6703620.999877
" + "shape: (6, 6)
abcdnew_aratio
f64f64f64f64f64f64
40554.89674440555.54643440553.99297240555.01691340559.8967440.999877
40554.62295240554.74328740555.91336840553.41594640559.6229520.999877
40554.99035640556.12609740554.59859540555.33241940559.9903560.999877
40556.55112940554.49408140554.78291440555.34819140561.5511290.999877
40555.70557740555.8260940555.44589340556.2181740560.7055770.999877
40555.34260340556.17895340554.99531840555.54639840560.3426030.999877
" ], "text/plain": [ "shape: (6, 6)\n", @@ -968,24 +965,22 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════╡\n", - "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ 0.999877 │\n", - "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ 0.999877 │\n", - "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ 0.999877 │\n", - "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ 0.999877 │\n", - "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ 0.999877 │\n", - "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ 0.999877 │\n", + "│ 40554.896744 ┆ 40555.546434 ┆ 40553.992972 ┆ 40555.016913 ┆ 40559.896744 ┆ 0.999877 │\n", + "│ 40554.622952 ┆ 40554.743287 ┆ 40555.913368 ┆ 40553.415946 ┆ 40559.622952 ┆ 0.999877 │\n", + "│ 40554.990356 ┆ 40556.126097 ┆ 40554.598595 ┆ 40555.332419 ┆ 40559.990356 ┆ 0.999877 │\n", + "│ 40556.551129 ┆ 40554.494081 ┆ 40554.782914 ┆ 40555.348191 ┆ 40561.551129 ┆ 0.999877 │\n", + "│ 40555.705577 ┆ 40555.82609 ┆ 40555.445893 ┆ 40556.21817 ┆ 40560.705577 ┆ 0.999877 │\n", + "│ 40555.342603 ┆ 40556.178953 ┆ 40554.995318 ┆ 40555.546398 ┆ 40560.342603 ┆ 0.999877 │\n", "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────┘" ] }, - "execution_count": 47, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = df.with_columns(\n", - " (pl.col(\"a\") / pl.col(\"new_a\")).alias(\"ratio\")\n", - ")\n", + "df = df.with_columns((pl.col(\"a\") / pl.col(\"new_a\")).alias(\"ratio\"))\n", "df" ] }, @@ -1013,7 +1008,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 7)
abcdnew_aratioa_gt_0.5
f64f64f64f64f64f64bool
40554.72543640554.21243840556.42152940555.50293340559.7254360.999877true
40555.32900740555.47093640553.54274640555.68244740560.3290070.999877true
40554.42349740555.49267840554.36418940556.31110640559.4234970.999877true
40555.51089740556.90672840553.56661640555.31013540560.5108970.999877true
40555.48646340553.88755840556.68687640553.33151940560.4864630.999877true
40555.67036240555.8188440554.21954540555.77756940560.6703620.999877true
" + "shape: (6, 7)
abcdnew_aratioa_gt_0.5
f64f64f64f64f64f64bool
40554.89674440555.54643440553.99297240555.01691340559.8967440.999877true
40554.62295240554.74328740555.91336840553.41594640559.6229520.999877true
40554.99035640556.12609740554.59859540555.33241940559.9903560.999877true
40556.55112940554.49408140554.78291440555.34819140561.5511290.999877true
40555.70557740555.8260940555.44589340556.2181740560.7055770.999877true
40555.34260340556.17895340554.99531840555.54639840560.3426030.999877true
" ], "text/plain": [ "shape: (6, 7)\n", @@ -1022,24 +1017,22 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ bool │\n", "╞══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════╪══════════╡\n", - "│ 40554.725436 ┆ 40554.212438 ┆ 40556.421529 ┆ 40555.502933 ┆ 40559.725436 ┆ 0.999877 ┆ true │\n", - "│ 40555.329007 ┆ 40555.470936 ┆ 40553.542746 ┆ 40555.682447 ┆ 40560.329007 ┆ 0.999877 ┆ true │\n", - "│ 40554.423497 ┆ 40555.492678 ┆ 40554.364189 ┆ 40556.311106 ┆ 40559.423497 ┆ 0.999877 ┆ true │\n", - "│ 40555.510897 ┆ 40556.906728 ┆ 40553.566616 ┆ 40555.310135 ┆ 40560.510897 ┆ 0.999877 ┆ true │\n", - "│ 40555.486463 ┆ 40553.887558 ┆ 40556.686876 ┆ 40553.331519 ┆ 40560.486463 ┆ 0.999877 ┆ true │\n", - "│ 40555.670362 ┆ 40555.81884 ┆ 40554.219545 ┆ 40555.777569 ┆ 40560.670362 ┆ 0.999877 ┆ true │\n", + "│ 40554.896744 ┆ 40555.546434 ┆ 40553.992972 ┆ 40555.016913 ┆ 40559.896744 ┆ 0.999877 ┆ true │\n", + "│ 40554.622952 ┆ 40554.743287 ┆ 40555.913368 ┆ 40553.415946 ┆ 40559.622952 ┆ 0.999877 ┆ true │\n", + "│ 40554.990356 ┆ 40556.126097 ┆ 40554.598595 ┆ 40555.332419 ┆ 40559.990356 ┆ 0.999877 ┆ true │\n", + "│ 40556.551129 ┆ 40554.494081 ┆ 40554.782914 ┆ 40555.348191 ┆ 40561.551129 ┆ 0.999877 ┆ true │\n", + "│ 40555.705577 ┆ 40555.82609 ┆ 40555.445893 ┆ 40556.21817 ┆ 40560.705577 ┆ 0.999877 ┆ true │\n", + "│ 40555.342603 ┆ 40556.178953 ┆ 40554.995318 ┆ 40555.546398 ┆ 40560.342603 ┆ 0.999877 ┆ true │\n", "└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────┴──────────┘" ] }, - "execution_count": 48, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = df.with_columns(\n", - " (pl.col(\"a\") > 0.5).alias(\"a_gt_0.5\")\n", - ")\n", + "df = df.with_columns((pl.col(\"a\") > 0.5).alias(\"a_gt_0.5\"))\n", "df" ] } @@ -1069,7 +1062,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 
"3.12.12" + "version": "3.12.13" }, "toc-showtags": true }, diff --git a/visualise.quarto_ipynb_1 b/visualise.quarto_ipynb_1 index ceccefe..c54f104 100644 --- a/visualise.quarto_ipynb_1 +++ b/visualise.quarto_ipynb_1 @@ -28,7 +28,7 @@ "\n", "The first of the three kinds is *exploratory data visualisation*, and it's the kind that you do when you're looking and data and trying to understand it. Just plotting the data is a really good strategy for getting a feel for any issues there might be. This is perhaps most famously demonstrated by Anscombe's quartet: four different datasets with the same mean, standard deviation, and correlation but very different data distributions." ], - "id": "e22704e5" + "id": "b9313f4b" }, { "cell_type": "code", @@ -96,7 +96,7 @@ "plt.suptitle(\"Anscombe's Quartet\")\n", "plt.show()" ], - "id": "87f336c9", + "id": "3de1a2d3", "execution_count": null, "outputs": [] }, @@ -120,7 +120,7 @@ "\n", "You can find more information on the topic of communicating via data visualisations in the @sec-communicate-plots chapter." ], - "id": "4a968c3b" + "id": "303b55a9" } ], "metadata": { diff --git a/workflow-help.quarto_ipynb_1 b/workflow-help.quarto_ipynb_1 index 18428db..8b23ba1 100644 --- a/workflow-help.quarto_ipynb_1 +++ b/workflow-help.quarto_ipynb_1 @@ -35,7 +35,7 @@ "\n", "- First, you need to make your code reproducible. This means that you need to capture everything, i.e., include any packages you used and create all necessary objects. 
The easiest way to make sure you've done this is to use the [**watermark**](https://github.com/rasbt/watermark) package alongside whatever else you are doing:" ], - "id": "fa96eb6b" + "id": "91169e03" }, { "cell_type": "code", @@ -48,7 +48,7 @@ "print(watermark())\n", "print(watermark(iversions=True, globals_=globals()))" ], - "id": "4afe4b6e", + "id": "bccd500b", "execution_count": null, "outputs": [] }, @@ -72,7 +72,7 @@ "\n", "- **Data**: as others won't be able to easily download the data you're working with, it's often best to create a small amount of data from code that still have the same problem as you're finding with your actual data. Between **numpy** and **pandas**, it's quite easy to generate data from code; here's an example:" ], - "id": "defc6d2f" + "id": "4000f40e" }, { "cell_type": "code", @@ -87,7 +87,7 @@ "df[\"random_normal\"] = np.random.normal(size=6)\n", "df" ], - "id": "4d2de081", + "id": "1187dba7", "execution_count": null, "outputs": [] }, @@ -99,7 +99,7 @@ "\n", "Finish by checking that you have actually made a reproducible example by starting a fresh Python session and copying and pasting your reprex in." ], - "id": "ad3edef5" + "id": "abea1503" } ], "metadata": { diff --git a/workflow-packages-and-environments.quarto_ipynb_1 b/workflow-packages-and-environments.quarto_ipynb_1 index ebd7b64..f61856c 100644 --- a/workflow-packages-and-environments.quarto_ipynb_1 +++ b/workflow-packages-and-environments.quarto_ipynb_1 @@ -104,7 +104,7 @@ "\n", "You may have noticed that a file called `pyproject.toml` has been created." 
], - "id": "8b429c82" + "id": "58431611" }, { "cell_type": "code", @@ -115,7 +115,7 @@ "\n", "print_json(data=toml.load(\"pyproject.toml\"))" ], - "id": "a099d93a", + "id": "4bb6211b", "execution_count": null, "outputs": [] }, @@ -133,7 +133,7 @@ "\n", "In the screenshot above, you can see the project-environment in two places: on the blue bar at the bottom of the screen, and (in 5), at the top right hand side of the interactive window. A similar top right indicator is present when you have a Jupyter Notebook open too." ], - "id": "08e648b4" + "id": "76e4b027" } ], "metadata": {