diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 45cfc9cd7..909db0898 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -26,19 +26,35 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Write your code below.\n", - "\n" + "\n", + "%load_ext dotenv\n", + "%dotenv " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\NEWPC\\miniconda3\\envs\\dsi_participant\\lib\\site-packages\\dask\\dataframe\\__init__.py:42: FutureWarning: \n", + "Dask dataframe query planning is disabled because dask-expr is not installed.\n", + "\n", + "You can install it with `pip install dask[dataframe]` or `conda install dask`.\n", + "This will raise in a future version.\n", + "\n", + " warnings.warn(msg, FutureWarning)\n" + ] + } + ], "source": [ "import dask.dataframe as dd" ] @@ -58,12 +74,39 @@ "execution_count": 3, "metadata": {}, "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import sys\n", + "from glob import glob\n", + "\n", + "sys.path.append(os.getenv('PRICE_DATA'))\n", + "from utils.logger import get_logger \n", + "_logs = get_logger(__name__) " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-28 20:53:44,751, 366802870.py, 6, INFO, Reading price data from: ../../05_src/data/prices/\n" + ] + } + ], "source": [ "import os\n", "from glob import glob\n", "\n", "# Write your code below.\n", - "\n" + "PRICE_DATA = os.getenv(\"PRICE_DATA\")\n", + "_logs.info(f\"Reading price data from: {PRICE_DATA}\")\n", + "parquet_files = glob(os.path.join(PRICE_DATA, \"**/*.parquet\"), recursive=True)\n", + "dd_px = dd.read_parquet(parquet_files).set_index(\"ticker\")\n" ] }, { @@ -88,12 +131,338 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-28 20:55:00,064, 3115447876.py, 3, INFO, Starting feature engineering for each ticker\n", + "C:\\Users\\NEWPC\\AppData\\Local\\Temp\\ipykernel_19848\\3115447876.py:5: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n", + " Before: .apply(func)\n", + " After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n", + " or: .apply(func, meta=('x', 'f8')) for series result\n", + " dd_shift = dd_px.groupby('ticker', group_keys=False).apply(\n" + ] + } + ], "source": [ "# Write your code below.\n", - "\n" + "\n", + "_logs.info(\" Starting feature engineering for each ticker\")\n", + "\n", + "dd_shift = dd_px.groupby('ticker', group_keys=False).apply(\n", + " lambda x: x.assign(Close_lag_1 = x['Close'].shift(1),\n", + " Adj_Close_lag_1 = x['Adj Close'].shift(1))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dd_rets = dd_shift.assign(\n", + " \n", + " hi_lo_range = lambda x: x['High'] - x['Low'],\n", + " Returns = lambda x: x['Close']/x['Close_lag_1'] - 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Date | \n", + "Open | \n", + "High | \n", + "Low | \n", + "Close | \n", + "Adj Close | \n", + "Volume | \n", + "source | \n", + "Year | \n", + "Close_lag_1 | \n", + "Adj_Close_lag_1 | \n", + "hi_lo_range | \n", + "Returns | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| AADR | \n", + "2010-07-21 | \n", + "25.100000 | \n", + "25.100000 | \n", + "24.700001 | \n", + "24.700001 | \n", + "23.343714 | \n", + "42000.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "NaN | \n", + "NaN | \n", + "0.400000 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-22 | \n", + "25.420000 | \n", + "25.420000 | \n", + "25.129999 | \n", + "25.260000 | \n", + "23.872967 | \n", + "17500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "24.700001 | \n", + "23.343714 | \n", + "0.290001 | \n", + "0.022672 | \n", + "
| AADR | \n", + "2010-07-23 | \n", + "25.540001 | \n", + "25.540001 | \n", + "25.080000 | \n", + "25.280001 | \n", + "23.891865 | \n", + "8600.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.260000 | \n", + "23.872967 | \n", + "0.460001 | \n", + "0.000792 | \n", + "
| AADR | \n", + "2010-07-26 | \n", + "25.400000 | \n", + "25.400000 | \n", + "25.219999 | \n", + "25.370001 | \n", + "23.976921 | \n", + "18900.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.280001 | \n", + "23.891865 | \n", + "0.180000 | \n", + "0.003560 | \n", + "
| AADR | \n", + "2010-07-27 | \n", + "25.250000 | \n", + "25.290001 | \n", + "25.200001 | \n", + "25.290001 | \n", + "23.901318 | \n", + "8200.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.370001 | \n", + "23.976921 | \n", + "0.090000 | \n", + "-0.003153 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| ZIXI | \n", + "2020-03-26 | \n", + "4.060000 | \n", + "4.530000 | \n", + "3.880000 | \n", + "4.510000 | \n", + "4.510000 | \n", + "1668500.0 | \n", + "ZIXI.csv | \n", + "2020 | \n", + "4.000000 | \n", + "4.000000 | \n", + "0.650000 | \n", + "0.127500 | \n", + "
| ZIXI | \n", + "2020-03-27 | \n", + "4.490000 | \n", + "4.710000 | \n", + "4.100000 | \n", + "4.600000 | \n", + "4.600000 | \n", + "1146800.0 | \n", + "ZIXI.csv | \n", + "2020 | \n", + "4.510000 | \n", + "4.510000 | \n", + "0.610000 | \n", + "0.019956 | \n", + "
| ZIXI | \n", + "2020-03-30 | \n", + "4.830000 | \n", + "4.870000 | \n", + "4.440000 | \n", + "4.640000 | \n", + "4.640000 | \n", + "1212000.0 | \n", + "ZIXI.csv | \n", + "2020 | \n", + "4.600000 | \n", + "4.600000 | \n", + "0.430000 | \n", + "0.008696 | \n", + "
| ZIXI | \n", + "2020-03-31 | \n", + "4.600000 | \n", + "4.690000 | \n", + "4.100000 | \n", + "4.310000 | \n", + "4.310000 | \n", + "1057200.0 | \n", + "ZIXI.csv | \n", + "2020 | \n", + "4.640000 | \n", + "4.640000 | \n", + "0.590000 | \n", + "-0.071121 | \n", + "
| ZIXI | \n", + "2020-04-01 | \n", + "4.110000 | \n", + "4.160000 | \n", + "3.800000 | \n", + "3.820000 | \n", + "3.820000 | \n", + "539500.0 | \n", + "ZIXI.csv | \n", + "2020 | \n", + "4.310000 | \n", + "4.310000 | \n", + "0.360000 | \n", + "-0.113689 | \n", + "
334040 rows × 13 columns
\n", + "| \n", + " | Date | \n", + "Open | \n", + "High | \n", + "Low | \n", + "Close | \n", + "Adj Close | \n", + "Volume | \n", + "source | \n", + "Year | \n", + "Close_lag_1 | \n", + "Adj_Close_lag_1 | \n", + "hi_lo_range | \n", + "Returns | \n", + "returns_ma_10 | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| AADR | \n", + "2010-07-21 | \n", + "25.100000 | \n", + "25.100000 | \n", + "24.700001 | \n", + "24.700001 | \n", + "23.343714 | \n", + "42000.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "NaN | \n", + "NaN | \n", + "0.400000 | \n", + "NaN | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-22 | \n", + "25.420000 | \n", + "25.420000 | \n", + "25.129999 | \n", + "25.260000 | \n", + "23.872967 | \n", + "17500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "24.700001 | \n", + "23.343714 | \n", + "0.290001 | \n", + "0.022672 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-23 | \n", + "25.540001 | \n", + "25.540001 | \n", + "25.080000 | \n", + "25.280001 | \n", + "23.891865 | \n", + "8600.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.260000 | \n", + "23.872967 | \n", + "0.460001 | \n", + "0.000792 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-26 | \n", + "25.400000 | \n", + "25.400000 | \n", + "25.219999 | \n", + "25.370001 | \n", + "23.976921 | \n", + "18900.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.280001 | \n", + "23.891865 | \n", + "0.180000 | \n", + "0.003560 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-27 | \n", + "25.250000 | \n", + "25.290001 | \n", + "25.200001 | \n", + "25.290001 | \n", + "23.901318 | \n", + "8200.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.370001 | \n", + "23.976921 | \n", + "0.090000 | \n", + "-0.003153 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-28 | \n", + "25.250000 | \n", + "25.290001 | \n", + "25.120001 | \n", + "25.200001 | \n", + "23.816256 | \n", + "4900.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.290001 | \n", + "23.901318 | \n", + "0.170000 | \n", + "-0.003559 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-29 | \n", + "25.299999 | \n", + "25.299999 | \n", + "25.020000 | \n", + "25.020000 | \n", + "23.646139 | \n", + "1200.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.200001 | \n", + "23.816256 | \n", + "0.279999 | \n", + "-0.007143 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-07-30 | \n", + "24.990000 | \n", + "25.100000 | \n", + "24.990000 | \n", + "25.100000 | \n", + "23.721750 | \n", + "600.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.020000 | \n", + "23.646139 | \n", + "0.110001 | \n", + "0.003197 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-08-02 | \n", + "25.700001 | \n", + "25.709999 | \n", + "25.440001 | \n", + "25.620001 | \n", + "24.213196 | \n", + "7000.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.100000 | \n", + "23.721750 | \n", + "0.269999 | \n", + "0.020717 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-08-03 | \n", + "25.750000 | \n", + "25.860001 | \n", + "25.750000 | \n", + "25.830000 | \n", + "24.411661 | \n", + "11200.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.620001 | \n", + "24.213196 | \n", + "0.110001 | \n", + "0.008197 | \n", + "NaN | \n", + "
| AADR | \n", + "2010-08-04 | \n", + "25.860001 | \n", + "25.860001 | \n", + "25.860001 | \n", + "25.860001 | \n", + "24.440016 | \n", + "500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.830000 | \n", + "24.411661 | \n", + "0.000000 | \n", + "0.001161 | \n", + "0.004644 | \n", + "
| AADR | \n", + "2010-08-05 | \n", + "25.910000 | \n", + "25.910000 | \n", + "25.910000 | \n", + "25.910000 | \n", + "24.487267 | \n", + "500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.860001 | \n", + "24.440016 | \n", + "0.000000 | \n", + "0.001933 | \n", + "0.002570 | \n", + "
| AADR | \n", + "2010-08-06 | \n", + "25.309999 | \n", + "25.709999 | \n", + "25.309999 | \n", + "25.709999 | \n", + "24.298254 | \n", + "700.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.910000 | \n", + "24.487267 | \n", + "0.400000 | \n", + "-0.007719 | \n", + "0.001719 | \n", + "
| AADR | \n", + "2010-08-09 | \n", + "26.000000 | \n", + "26.230000 | \n", + "26.000000 | \n", + "26.059999 | \n", + "24.629036 | \n", + "5500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.709999 | \n", + "24.298254 | \n", + "0.230000 | \n", + "0.013613 | \n", + "0.002725 | \n", + "
| AADR | \n", + "2010-08-10 | \n", + "26.080000 | \n", + "26.080000 | \n", + "26.080000 | \n", + "26.080000 | \n", + "24.647932 | \n", + "500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "26.059999 | \n", + "24.629036 | \n", + "0.000000 | \n", + "0.000767 | \n", + "0.003117 | \n", + "
| AADR | \n", + "2010-08-11 | \n", + "25.170000 | \n", + "25.170000 | \n", + "25.170000 | \n", + "25.170000 | \n", + "23.787901 | \n", + "100.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "26.080000 | \n", + "24.647932 | \n", + "0.000000 | \n", + "-0.034893 | \n", + "-0.000017 | \n", + "
| AADR | \n", + "2010-08-12 | \n", + "25.040001 | \n", + "25.040001 | \n", + "25.040001 | \n", + "25.040001 | \n", + "23.665043 | \n", + "1000.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.170000 | \n", + "23.787901 | \n", + "0.000000 | \n", + "-0.005165 | \n", + "0.000181 | \n", + "
| AADR | \n", + "2010-08-13 | \n", + "25.170000 | \n", + "25.170000 | \n", + "25.170000 | \n", + "25.170000 | \n", + "23.787901 | \n", + "100.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.040001 | \n", + "23.665043 | \n", + "0.000000 | \n", + "0.005192 | \n", + "0.000380 | \n", + "
| AADR | \n", + "2010-08-16 | \n", + "25.280001 | \n", + "25.280001 | \n", + "25.240000 | \n", + "25.240000 | \n", + "23.854063 | \n", + "3000.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.170000 | \n", + "23.787901 | \n", + "0.040001 | \n", + "0.002781 | \n", + "-0.001413 | \n", + "
| AADR | \n", + "2010-08-17 | \n", + "25.570000 | \n", + "25.750000 | \n", + "25.570000 | \n", + "25.750000 | \n", + "24.336056 | \n", + "6500.0 | \n", + "AADR.csv | \n", + "2010 | \n", + "25.240000 | \n", + "23.854063 | \n", + "0.180000 | \n", + "0.020206 | \n", + "-0.000212 | \n", + "