From 3b471a0fe3ad1db2eef0d298dc7329feeb8d7d2c Mon Sep 17 00:00:00 2001
From: you-n-g
Date: Fri, 11 Nov 2022 10:25:04 +0800
Subject: [PATCH 01/15] Fix CI (#1347)

---
 qlib/rl/order_execution/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qlib/rl/order_execution/__init__.py b/qlib/rl/order_execution/__init__.py
index 318c774230d..b985c13317b 100644
--- a/qlib/rl/order_execution/__init__.py
+++ b/qlib/rl/order_execution/__init__.py
@@ -16,8 +16,8 @@
 from .policy import AllOne, PPO
 from .reward import PAPenaltyReward
 from .simulator_simple import SingleAssetOrderExecutionSimple
-from .state import SAOEStateAdapter, SAOEMetrics, SAOEState
-from .strategy import SAOEStrategy, ProxySAOEStrategy, SAOEIntStrategy
+from .state import SAOEMetrics, SAOEState
+from .strategy import SAOEStateAdapter, SAOEStrategy, ProxySAOEStrategy, SAOEIntStrategy

 __all__ = [
     "FullHistoryStateInterpreter",

From a82cc0b12963e989fd33527bce666aabfa21ce75 Mon Sep 17 00:00:00 2001
From: Xu Yang
Date: Fri, 11 Nov 2022 19:35:10 +0800
Subject: [PATCH 02/15] update TSDataSampler refining the memory layout of
 data array to speed up NN training (#1342)

* update TSDataSampler

* reformat code with black

* use pre-commit to reformat the code

* Add documents

* More docstring

* More Safety

Co-authored-by: Young
---
 qlib/data/dataset/__init__.py | 151 ++++++++++++++++++++++++++++------
 1 file changed, 127 insertions(+), 24 deletions(-)

diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index 5e98bfc97af..dcc9957ed63 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -82,7 +82,11 @@ class DatasetH(Dataset):
     """

     def __init__(
-        self, handler: Union[Dict, DataHandler], segments: Dict[Text, Tuple], fetch_kwargs: Dict = {}, **kwargs
+        self,
+        handler: Union[Dict, DataHandler],
+        segments: Dict[Text, Tuple],
+        fetch_kwargs: Dict = {},
+        **kwargs,
     ):
         """
         Setup the underlying data.
@@ -284,10 +288,69 @@ class TSDataSampler:
     - For performance issues, this Sampler will convert dataframe into arrays for better performance. This could result
       in a different data type

+
+    Indices design:
+        TSDataSampler has an index mechanism to help users query time-series data efficiently.
+
+        The definition of related variables:
+            data_arr: np.ndarray
+                The original data. It contains all the original data.
+                Queries are often for the time-series of a specific stock.
+                To leverage this characteristic and speed up querying, the multi-index of data_arr is rearranged in (instrument, datetime) order
+
+            data_index: pd.MultiIndex with index order <instrument, datetime>
+                It has the same shape as `idx_map`. Their elements are expected to be aligned.
+
+            idx_map: np.ndarray
+                It is the indexable data. It originates from data_arr, and is then filtered by 1) `start` and `end` 2) `flt_data`
+                The extra data in data_arr is useful in the following cases
+                1) creating meaningful time series data before `start` instead of padding them with zeros
+                2) some data are excluded by `flt_data` (e.g. no sample pair for that index), but they are still used in the time-series in X
+
+                Finally, it will look like:
+
+                array([[  0,   0],
+                       [  1,   0],
+                       [  2,   0],
+                       ...,
+                       [241, 348],
+                       [242, 348],
+                       [243, 348]], dtype=int32)
+
+                It lists all the indexable data (some data only used in historical time-series may not be indexable); the values are the corresponding row and col in idx_df
+            idx_df: pd.DataFrame
+                It aims to map the <datetime, instrument> key to the original position in data_arr
+
+                For example, it may look like (NOTE: the index for an instrument's time-series is contiguous in memory)
+
+                    instrument SH600000 SH600008 SH600009 SH600010 SH600011 SH600015 ...
+                    datetime
+                    2017-01-03        0      242      473      717      NaN      974 ...
+                    2017-01-04        1      243      474      718      NaN      975 ...
+                    2017-01-05        2      244      475      719      NaN      976 ...
+                    2017-01-06        3      245      476      720      NaN      977 ...
+
+        With these two indices (idx_map, idx_df) and the original data (data_arr), we can make the following queries fast (implemented in __getitem__)
+        (1) Get the i-th indexable sample (time-series): (indexable sample index) -> [idx_map] -> (row, col) -> [idx_df] -> (index in data_arr)
+        (2) Get the specific sample by <datetime, instrument>: (<datetime, instrument>, i.e. <row, col>) -> [idx_df] -> (index in data_arr)
+        (3) Get the index of a time-series data: (get the <row, col>, refer to (1), (2)) -> [idx_df] -> (all indices in data_arr for time-series)
     """

+    # Please refer to the docstring of TSDataSampler for the definition of the following attributes
+    data_arr: np.ndarray
+    data_index: pd.MultiIndex
+    idx_map: np.ndarray
+    idx_df: pd.DataFrame
+
     def __init__(
-        self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None, flt_data=None
+        self,
+        data: pd.DataFrame,
+        start,
+        end,
+        step_len: int,
+        fillna_type: str = "none",
+        dtype=None,
+        flt_data=None,
     ):
         """
         Build a dataset which looks like torch.data.utils.Dataset.
@@ -295,7 +358,7 @@ def __init__(
         Parameters
         ----------
         data : pd.DataFrame
-            The raw tabular data
+            The raw tabular data whose index order is <"datetime", "instrument">
         start :
             The indexable start time
         end :
@@ -311,7 +374,7 @@ def __init__(
             ffill+bfill: ffill with previous samples first and fill with later samples second

         flt_data : pd.Series
-            a column of data(True or False) to filter data.
+            a column of data (True or False) to filter data. Its index order is <"datetime", "instrument">

             None: keep all data
@@ -321,7 +384,10 @@
         self.step_len = step_len
         self.fillna_type = fillna_type
         assert get_level_index(data, "datetime") == 0
-        self.data = lazy_sort_index(data)
+        self.data = data.swaplevel().sort_index().copy()
+        data.drop(
+            data.columns, axis=1, inplace=True
+        )  # `data` is useless after being copied into the rearranged one; free its memory here to avoid keeping three big dataframes in memory at once (i.e. data, self.data, self.data_arr)

         kwargs = {"object": self.data}
         if dtype is not None:
@@ -332,7 +398,9 @@ def __init__(
         # - append last line with full NaN for better performance in `__getitem__`
         # - Keeping the same dtype will result in better performance
         self.data_arr = np.append(
-            self.data_arr, np.full((1, self.data_arr.shape[1]), np.nan, dtype=self.data_arr.dtype), axis=0
+            self.data_arr,
+            np.full((1, self.data_arr.shape[1]), np.nan, dtype=self.data_arr.dtype),
+            axis=0,
         )
         self.nan_idx = -1  # The last line is all NaN

@@ -347,19 +415,36 @@ def __init__(
                 flt_data = flt_data.iloc[:, 0]
             # NOTE: bool(np.nan) is True !!!!!!!!
             # make sure reindex comes first. Otherwise extra NaN may appear.
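+            # Illustration (hypothetical labels, not from the original data):
+            #   flt_data index:   (2017-01-03, SH600000), ...  i.e. <datetime, instrument>
+            #   self.data_index:  (SH600000, 2017-01-03), ...  i.e. <instrument, datetime>
+            # so the levels must be swapped before reindexing against self.data_index.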
+            flt_data = flt_data.swaplevel()
             flt_data = flt_data.reindex(self.data_index).fillna(False).astype(np.bool)
             self.flt_data = flt_data.values
             self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
             self.data_index = self.data_index[np.where(self.flt_data)[0]]
         self.idx_map = self.idx_map2arr(self.idx_map)
-
-        self.start_idx, self.end_idx = self.data_index.slice_locs(
-            start=time_to_slc_point(start), end=time_to_slc_point(end)
+        self.idx_map, self.data_index = self.slice_idx_map_and_data_index(
+            self.idx_map, self.idx_df, self.data_index, start, end
         )

-        self.idx_arr = np.array(self.idx_df.values, dtype=np.float64) # for better performance
+        self.idx_arr = np.array(self.idx_df.values, dtype=np.float64)  # for better performance
         del self.data  # save memory

+    @staticmethod
+    def slice_idx_map_and_data_index(
+        idx_map,
+        idx_df,
+        data_index,
+        start,
+        end,
+    ):
+        assert (
+            len(idx_map) == data_index.shape[0]
+        )  # make sure idx_map and data_index have the same length, so indices of idx_map can be used on data_index
+
+        start_row_idx, end_row_idx = idx_df.index.slice_locs(start=time_to_slc_point(start), end=time_to_slc_point(end))
+
+        time_filter_idx = (idx_map[:, 0] < end_row_idx) & (idx_map[:, 0] >= start_row_idx)
+        return idx_map[time_filter_idx], data_index[time_filter_idx]
+
     @staticmethod
     def idx_map2arr(idx_map):
         # pytorch data sampler will have better memory control without large dict or list
@@ -394,7 +479,7 @@ def get_index(self):
         Get the pandas index of the data, it will be useful in the following scenarios
         - Special sampler will be used (e.g. user want to sample day by day)
         """
-        return self.data_index[self.start_idx : self.end_idx]
+        return self.data_index.swaplevel()  # to align with the multi-index order of the original data received by __init__

     def config(self, **kwargs):
         # Config the attributes
@@ -409,25 +494,33 @@ def build_index(data: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
         Parameters
         ----------
         data : pd.DataFrame
-            The dataframe with <datetime, instrument> index
+            A DataFrame with index in <instrument, datetime> order

+                                          RSQR5     RESI5     WVMA5    LABEL0
+            instrument datetime
+            SH600000   2017-01-03  0.016389  0.461632 -1.154788 -0.048056
+                       2017-01-04  0.884545 -0.110597 -1.059332 -0.030139
+                       2017-01-05  0.507540 -0.535493 -1.099665 -0.644983
+                       2017-01-06 -1.267771 -0.669685 -1.636733  0.295366
+                       2017-01-09  0.339346  0.074317 -0.984989  0.765540

         Returns
         -------
         Tuple[pd.DataFrame, dict]:
             1) the first element: reshape the original index into a 2D dataframe
-            instrument SH600000 SH600004 SH600006 SH600007 SH600008 SH600009 ...
+            instrument SH600000 SH600008 SH600009 SH600010 SH600011 SH600015 ...
             datetime
-            2021-01-11        0        1        2        3        4        5 ...
-            2021-01-12     4146     4147     4148     4149     4150     4151 ...
-            2021-01-13     8293     8294     8295     8296     8297     8298 ...
-            2021-01-14    12441    12442    12443    12444    12445    12446 ...
+            2017-01-03        0      242      473      717      NaN      974 ...
+            2017-01-04        1      243      474      718      NaN      975 ...
+            2017-01-05        2      244      475      719      NaN      976 ...
+            2017-01-06        3      245      476      720      NaN      977 ...
+            2) the second element: {<original index>: <row, col>}
         """
         # object in case pandas converts int to float
         idx_df = pd.Series(range(data.shape[0]), index=data.index, dtype=object)
         idx_df = lazy_sort_index(idx_df.unstack())
         # NOTE: the correctness of `__getitem__` depends on columns sorted here
-        idx_df = lazy_sort_index(idx_df, axis=1)
+        idx_df = lazy_sort_index(idx_df, axis=1).T

         idx_map = {}
         for i, (_, row) in enumerate(idx_df.iterrows()):
@@ -485,11 +578,11 @@ def _get_row_col(self, idx) -> Tuple[int]:
         """
         # Get the right row number `i` and col number `j` in idx_df
         if isinstance(idx, (int, np.integer)):
-            real_idx = self.start_idx + idx
-            if self.start_idx <= real_idx < self.end_idx:
+            real_idx = idx
+            if 0 <= real_idx < len(self.idx_map):
                 i, j = self.idx_map[real_idx]  # TODO: The performance of this line is not good
             else:
-                raise KeyError(f"{real_idx} is out of [{self.start_idx}, {self.end_idx})")
+                raise KeyError(f"{real_idx} is out of [0, {len(self.idx_map)})")
         elif isinstance(idx, tuple):
             # ["datetime", "instruments"]
             date, inst = idx
@@ -532,7 +625,10 @@ def __getitem__(self, idx: Union[int, Tuple[object, str], List[int]]):
         # precision problems. It will not cause any problems in my tests at least
         indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(int)

-        data = self.data_arr[indices]
+        if (np.diff(indices) == 1).all():  # slicing instead of indexing to speed things up
+            data = self.data_arr[indices[0] : indices[-1] + 1]
+        else:
+            data = self.data_arr[indices]
         if isinstance(idx, mtit):
             # if we get multiple indices, an additional dimension should be added.
             #
         return data

     def __len__(self):
-        return self.end_idx - self.start_idx
+        return len(self.idx_map)


 class TSDatasetH(DatasetH):
@@ -611,7 +707,14 @@ def _prepare_seg(self, slc: slice, **kwargs) -> TSDataSampler:
         else:
             flt_data = None

-        tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype, flt_data=flt_data)
+        tsds = TSDataSampler(
+            data=data,
+            start=start,
+            end=end,
+            step_len=self.step_len,
+            dtype=dtype,
+            flt_data=flt_data,
+        )
         return tsds


From ff2154c618ae972f704164beb094dcc330c375c4 Mon Sep 17 00:00:00 2001
From: He Yi
Date: Fri, 11 Nov 2022 19:53:33 +0800
Subject: [PATCH 03/15] fix clip_outlier bug in class RobustZScoreNorm(Processor)
 (#1294)

---
 qlib/data/dataset/processor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
index 26ff7e09dad..b7abb200029 100644
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -289,9 +289,9 @@ def __call__(self, df):
         X = df[self.cols]
         X -= self.mean_train
         X /= self.std_train
-        df[self.cols] = X
         if self.clip_outlier:
-            df.clip(-3, 3, inplace=True)
+            X = np.clip(X, -3, 3)
+        df[self.cols] = X
         return df
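A minimal sketch of the behavior this patch fixes (hypothetical data; the
column names `feat` and `LABEL0` are made up): `df.clip(-3, 3, inplace=True)`
clipped every column of the DataFrame, including label columns outside
`self.cols`, while the fixed code clips only the normalized feature columns.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"feat": [0.0, 50.0], "LABEL0": [5.0, -7.0]})
    cols = ["feat"]
    X = (df[cols] - 1.0) / 2.0        # stand-in for the robust z-score step
    df[cols] = np.clip(X, -3, 3)      # clip only the feature columns
    assert df["LABEL0"].tolist() == [5.0, -7.0]  # labels stay untouched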
From 4001a5d1571cb622315dd6c7ff6034ad42d6cd17 Mon Sep 17 00:00:00 2001
From: qianyun210603
Date: Sun, 13 Nov 2022 19:03:23 +0800
Subject: [PATCH 04/15] Bug fix for Rank and WMA operators (#1228)

* bug fix: 1) 100 should be used to scale down percentileofscore return to
  0-1, not length of array; 2) for (linear) weighted MA(n), weight should be
  n, n-1, ..., 1 instead of n-1, ..., 0

* use native pandas function for rank

* remove useless import

* require pandas 1.4+

* rank for py37+pandas 1.3.5 compatibility

* lint improvement

* lint black fix

* use hasattr instead of version to check whether rolling.rank is implemented
---
 qlib/data/ops.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/qlib/data/ops.py b/qlib/data/ops.py
index 1cbb1d2e628..fe2ebc9f6d9 100644
--- a/qlib/data/ops.py
+++ b/qlib/data/ops.py
@@ -34,8 +34,6 @@
 #################### Element-Wise Operator ####################
-
-
 class ElemOperator(ExpressionOps):
     """Element-wise Operator
@@ -216,9 +214,7 @@ class Not(NpElemOperator):

     Parameters
     ----------
-    feature_left : Expression
-        feature instance
-    feature_right : Expression
+    feature : Expression
         feature instance

     Returns
@@ -241,8 +237,6 @@ class PairOperator(ExpressionOps):
         feature instance or numeric value
     feature_right : Expression
         feature instance or numeric value
-    func : str
-        operator function

     Returns
     ----------
@@ -1155,9 +1149,13 @@ class Rank(Rolling):
     def __init__(self, feature, N):
         super(Rank, self).__init__(feature, N, "rank")

+    # for compatibility with python 3.7, which doesn't support pandas 1.4.0+ (which implements Rolling.rank)
     def _load_internal(self, instrument, start_index, end_index, *args):
         series = self.feature.load(instrument, start_index, end_index, *args)
-        # TODO: implement in Cython
+
+        rolling_or_expanding = series.expanding(min_periods=1) if self.N == 0 else series.rolling(self.N, min_periods=1)
+        if hasattr(rolling_or_expanding, "rank"):
+            return rolling_or_expanding.rank(pct=True)

         def rank(x):
             if np.isnan(x[-1]):
@@ -1165,13 +1163,9 @@ def rank(x):
             x1 = x[~np.isnan(x)]
             if x1.shape[0] == 0:
                 return np.nan
-            return percentileofscore(x1, x1[-1]) / len(x1)
+            return percentileofscore(x1, x1[-1]) / 100

-        if self.N == 0:
-            series = series.expanding(min_periods=1).apply(rank, raw=True)
-        else:
-            series = series.rolling(self.N, min_periods=1).apply(rank, raw=True)
-        return series
+        return rolling_or_expanding.apply(rank, raw=True)


 class Count(Rolling):
@@ -1341,7 +1335,7 @@ def _load_internal(self, instrument, start_index, end_index, *args):
         # TODO: implement in Cython

         def weighted_mean(x):
-            w = np.arange(len(x))
+            w = np.arange(len(x)) + 1
             w = w / w.sum()
             return np.nanmean(w * x)

From 82afd6a67aba1769ce3b03d1e600de378a8f7ec3 Mon Sep 17 00:00:00 2001
From: Maxim Smolskiy
Date: Sun, 13 Nov 2022 17:07:08 +0300
Subject: [PATCH 05/15] Fix the Warnings in rst files when building Qlib's
 documentation (#1349)

* Fix docs/advanced/alpha.rst
* Fix docs/reference/api.rst
* Fix docs/component/strategy.rst
* Fix docs/start/integration.rst
* Fix docs/component/report.rst
* Fix docs/component/data.rst
* Fix docs/component/rl/framework.rst
* Fix docs/introduction/quick.rst
* Fix docs/advanced/task_management.rst
* Fix CHANGES.rst
* Fix docs/developer/code_standard_and_dev_guide.rst
* Fix docs/hidden/client.rst
* Fix docs/component/online.rst
* Fix docs/start/getdata.rst
* Add docs/hidden to exclude patterns
* Add docs/developer/code_standard_and_dev_guide.rst to index.rst
* Change docs/developer/code_standard_and_dev_guide.rst place in index.rst
---
 CHANGES.rst                                   |  12 +-
 docs/advanced/alpha.rst                       |   2 +-
 docs/advanced/task_management.rst             |   6 +-
 docs/component/data.rst                       |  10 +-
 docs/component/online.rst                     |   2 +-
 docs/component/report.rst                     |   1 +
 docs/component/rl/framework.rst               |   4 +-
 docs/component/strategy.rst                   |   1 +
 docs/conf.py                                  |   2 +-
 .../developer/code_standard_and_dev_guide.rst |   9 +-
 docs/hidden/client.rst                        |   3 +-
 docs/index.rst                                |   6 +
 docs/introduction/quick.rst                   |   1 +
 docs/reference/api.rst                        |   1 +
 docs/start/getdata.rst                        |  34 ++--
 docs/start/integration.rst                    | 152 +++++++++---------
 16 files changed, 131 insertions(+), 115 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 3e94dc44e38..76aa4829304 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -85,7 +85,7 @@ Version 0.4.0 ------------- - Add `data` package that holds all data-related codes - Reform the data provider structure -- Create a server for data centralized management `qlib-server`_ +- Create a server for data centralized management `qlib-server `_ - Add a `ClientProvider` to work with server - Add a pluggable cache mechanism - Add a recursive backtracking algorithm to inspect the furthest reference date for an expression @@ -166,12 +166,12 @@ Version 0.8.0 - Nested decision execution framework is supported - There are lots of changes for daily trading, it is hard to list all of them. But a few important changes could be noticed - The trading limitation is more accurate; - - In `previous version `_, longing and shorting actions share the same action. - - In `current version `_, the trading limitation is different between logging and shorting action. + - In `previous version `__, longing and shorting actions share the same action. + - In `current version `__, the trading limitation is different between logging and shorting action. - The constant is different when calculating annualized metrics. - - `Current version `_ uses more accurate constant than `previous version `_ - - `A new version `_ of data is released. Due to the unstability of Yahoo data source, the data may be different after downloading data again. - - Users could check out the backtesting results between `Current version `_ and `previous version `_ + - `Current version `_ uses more accurate constant than `previous version `__ + - `A new version `__ of data is released. Due to the unstability of Yahoo data source, the data may be different after downloading data again. + - Users could check out the backtesting results between `Current version `__ and `previous version `__ Other Versions diff --git a/docs/advanced/alpha.rst b/docs/advanced/alpha.rst index 797eb19da54..88d65074c65 100644 --- a/docs/advanced/alpha.rst +++ b/docs/advanced/alpha.rst @@ -38,7 +38,7 @@ Example DIF = \frac{EMA(CLOSE, 12) - EMA(CLOSE, 26)}{CLOSE} - `DEA`means a 9-period EMA of the DIF. + `DEA` means a 9-period EMA of the DIF. .. math:: diff --git a/docs/advanced/task_management.rst b/docs/advanced/task_management.rst index d45c7b97d15..70b6bcfc860 100644 --- a/docs/advanced/task_management.rst +++ b/docs/advanced/task_management.rst @@ -18,7 +18,7 @@ With this module, users can run their ``task`` automatically at different period This whole process can be used in `Online Serving <../component/online.html>`_. -An example of the entire process is shown `here `_. +An example of the entire process is shown `here `__. Task Generating =============== @@ -33,7 +33,7 @@ Here is the base class of ``TaskGen``: :members: ``Qlib`` provides a class `RollingGen `_ to generate a list of ``task`` of the dataset in different date segments. -This class allows users to verify the effect of data from different periods on the model in one experiment. More information is `here <../reference/api.html#TaskGen>`_. +This class allows users to verify the effect of data from different periods on the model in one experiment. More information is `here <../reference/api.html#TaskGen>`__. Task Storing ============ @@ -54,7 +54,7 @@ Users need to provide the MongoDB URL and database name for using ``TaskManager` .. autoclass:: qlib.workflow.task.manage.TaskManager :members: -More information of ``Task Manager`` can be found in `here <../reference/api.html#TaskManager>`_. 
+More information of ``Task Manager`` can be found in `here <../reference/api.html#TaskManager>`__. Task Training ============= diff --git a/docs/component/data.rst b/docs/component/data.rst index b8279432e70..d3b8cafed21 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -24,8 +24,8 @@ The introduction of ``Data Layer`` includes the following parts. Here is a typical example of Qlib data workflow - Users download data and converting data into Qlib format(with filename suffix `.bin`). In this step, typically only some basic data are stored on disk(such as OHLCV). -- Creating some basic features based on Qlib's expression Engine(e.g. "Ref($close, 60) / $close", the return of last 60 trading days). Supported operators in the expression engine can be found `here `_. This step is typically implemented in Qlib's `Data Loader `_ which is a component of `Data Handler `_ . -- If users require more complicated data processing (e.g. data normalization), `Data Handler `_ support user-customized processors to process data(some predefined processors can be found `here `_). The processors are different from operators in expression engine. It is designed for some complicated data processing methods which is hard to supported in operators in expression engine. +- Creating some basic features based on Qlib's expression Engine(e.g. "Ref($close, 60) / $close", the return of last 60 trading days). Supported operators in the expression engine can be found `here `__. This step is typically implemented in Qlib's `Data Loader `_ which is a component of `Data Handler `_ . +- If users require more complicated data processing (e.g. data normalization), `Data Handler `_ support user-customized processors to process data(some predefined processors can be found `here `__). The processors are different from operators in expression engine. It is designed for some complicated data processing methods which is hard to supported in operators in expression engine. - At last, `Dataset `_ is responsible to prepare model-specific dataset from the processed data of Data Handler Data Preparation @@ -37,7 +37,7 @@ Qlib Format Data We've specially designed a data structure to manage financial data, please refer to the `File storage design section in Qlib paper `_ for detailed information. Such data will be stored with filename suffix `.bin` (We'll call them `.bin` file, `.bin` format, or qlib format). `.bin` file is designed for scientific computing on finance data. -``Qlib`` provides two different off-the-shelf datasets, which can be accessed through this `link `_: +``Qlib`` provides two different off-the-shelf datasets, which can be accessed through this `link `__: ======================== ================= ================ Dataset US Market China Market @@ -47,7 +47,7 @@ Alpha360 √ √ Alpha158 √ √ ======================== ================= ================ -Also, ``Qlib`` provides a high-frequency dataset. Users can run a high-frequency dataset example through this `link `_. +Also, ``Qlib`` provides a high-frequency dataset. Users can run a high-frequency dataset example through this `link `__. Qlib Format Dataset ------------------- @@ -512,7 +512,7 @@ Data and Cache File Structure We've specially designed a file structure to manage data and cache, please refer to the `File storage design section in Qlib paper `_ for detailed information. The file structure of data and cache is listed as follows. -.. code-block:: json +.. 
code-block:: - data/ [raw data] updated by data providers diff --git a/docs/component/online.rst b/docs/component/online.rst index c72c77a873d..351098db4d4 100644 --- a/docs/component/online.rst +++ b/docs/component/online.rst @@ -1,4 +1,4 @@ -.. _online: +.. _online_serving: ============== Online Serving diff --git a/docs/component/report.rst b/docs/component/report.rst index 6ed87eef057..30fca078836 100644 --- a/docs/component/report.rst +++ b/docs/component/report.rst @@ -174,6 +174,7 @@ Graphical Result The `Information Ratio` without cost. - `excess_return_with_cost` The `Information Ratio` with cost. + To know more about `Information Ratio`, please refer to `Information Ratio – IR `_. - `max_drawdown` - `excess_return_without_cost` diff --git a/docs/component/rl/framework.rst b/docs/component/rl/framework.rst index 7edb08efd90..a31cb1b4701 100644 --- a/docs/component/rl/framework.rst +++ b/docs/component/rl/framework.rst @@ -28,7 +28,7 @@ In QlibRL, EnvWrapper is a subclass of gym.Env, so it implements all necessary i EnvWrapper will organically organize these components. Such decomposition allows for better flexibility in development. For example, if the developers want to train multiple types of policies in the same environment, they only need to design one simulator and design different state interpreters/action interpreters/reward functions for different types of policies. -QlibRL has well-defined base classes for all these 4 components. All the developers need to do is define their own components by inheriting the base classes and then implementing all interfaces required by the base classes. The API for the above base components can be found `here <../../reference/api.html#module-qlib.rl>`_. +QlibRL has well-defined base classes for all these 4 components. All the developers need to do is define their own components by inheriting the base classes and then implementing all interfaces required by the base classes. The API for the above base components can be found `here <../../reference/api.html#module-qlib.rl>`__. Policy ------------ @@ -42,4 +42,4 @@ As you may have noticed, a training vessel itself holds all the required compone With a training vessel, the trainer could finally launch the training pipeline by simple, Scikit-learn-like interfaces (i.e., ``trainer.fit()``). -The API for Trainer and TrainingVessel and can be found `here <../../reference/api.html#module-qlib.rl.trainer>`_. \ No newline at end of file +The API for Trainer and TrainingVessel and can be found `here <../../reference/api.html#module-qlib.rl.trainer>`__. \ No newline at end of file diff --git a/docs/component/strategy.rst b/docs/component/strategy.rst index 919551fb314..910ebf7083b 100644 --- a/docs/component/strategy.rst +++ b/docs/component/strategy.rst @@ -80,6 +80,7 @@ TopkDropoutStrategy In most cases, ``TopkDrop`` algorithm sells and buys `Drop` stocks every trading day, which yields a turnover rate of 2$\times$`Drop`/$K$. The following images illustrate a typical scenario. + .. image:: ../_static/img/topk_drop.png :alt: Topk-Drop diff --git a/docs/conf.py b/docs/conf.py index a7147a964a1..442c89da2da 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -77,7 +77,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "hidden"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" diff --git a/docs/developer/code_standard_and_dev_guide.rst b/docs/developer/code_standard_and_dev_guide.rst index ae5927c837e..79a7778ad1a 100644 --- a/docs/developer/code_standard_and_dev_guide.rst +++ b/docs/developer/code_standard_and_dev_guide.rst @@ -15,7 +15,8 @@ Continuous Integration (CI) tools help you stick to the quality standards by run When you submit a PR request, you can check whether your code passes the CI tests in the "check" section at the bottom of the web page. 1. Qlib will check the code format with black. The PR will raise error if your code does not align to the standard of Qlib(e.g. a common error is the mixed use of space and tab). - You can fix the bug by inputing the following code in the command line. + + You can fix the bug by inputing the following code in the command line. .. code-block:: bash @@ -32,7 +33,8 @@ When you submit a PR request, you can check whether your code passes the CI test 3. Qlib will check your code style flake8. The checking command is implemented in [github action workflow](https://github.com/microsoft/qlib/blob/0e8b94a552f1c457cfa6cd2c1bb3b87ebb3fb279/.github/workflows/test.yml#L73). - You can fix the bug by inputing the following code in the command line. + + You can fix the bug by inputing the following code in the command line. .. code-block:: bash @@ -40,7 +42,8 @@ When you submit a PR request, you can check whether your code passes the CI test 4. Qlib has integrated pre-commit, which will make it easier for developers to format their code. - Just run the following two commands, and the code will be automatically formatted using black and flake8 when the git commit command is executed. + + Just run the following two commands, and the code will be automatically formatted using black and flake8 when the git commit command is executed. .. code-block:: bash diff --git a/docs/hidden/client.rst b/docs/hidden/client.rst index 7ca0d68013a..de6e2e681e8 100644 --- a/docs/hidden/client.rst +++ b/docs/hidden/client.rst @@ -81,6 +81,7 @@ If running on Windows, open **NFS** features and write correct **mount_path**, i * Open ``Programs and Features``. * Click ``Turn Windows features on or off``. * Scroll down and check the option ``Services for NFS``, then click OK + Reference address: https://graspingtech.com/mount-nfs-share-windows-10/ 2.config correct mount_path * In windows, mount path must be not exist path and root path, @@ -161,7 +162,7 @@ Limitations API *** -The client is based on `python-socketio`_ which is a framework that supports WebSocket client for Python language. The client can only propose requests and receive results, which do not include any calculating procedure. +The client is based on `python-socketio `_ which is a framework that supports WebSocket client for Python language. The client can only propose requests and receive results, which do not include any calculating procedure. Class ----- diff --git a/docs/index.rst b/docs/index.rst index 0d8cad81ada..3adf9049a76 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,12 @@ Document Structure Task Management Point-In-Time database +.. toctree:: + :maxdepth: 3 + :caption: FOR DEVELOPERS: + + Code Standard & Development Guidance + .. 
toctree:: :maxdepth: 3 :caption: REFERENCE: diff --git a/docs/introduction/quick.rst b/docs/introduction/quick.rst index 364e58caf12..78d9c2083f6 100644 --- a/docs/introduction/quick.rst +++ b/docs/introduction/quick.rst @@ -21,6 +21,7 @@ Users can easily intsall ``Qlib`` according to the following steps: - Before installing ``Qlib`` from source, users need to install some dependencies: .. code-block:: + pip install numpy pip install --upgrade cython diff --git a/docs/reference/api.rst b/docs/reference/api.rst index 98f50fc281e..4e6a7a85432 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -1,4 +1,5 @@ .. _api: + ============= API Reference ============= diff --git a/docs/start/getdata.rst b/docs/start/getdata.rst index 8849eb87cc5..cea9c2b0dc4 100644 --- a/docs/start/getdata.rst +++ b/docs/start/getdata.rst @@ -83,15 +83,14 @@ Load features of certain instruments in a given time range: >> from qlib.data import D >> instruments = ['SH600000'] >> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] - >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() - - $close $volume Ref($close, 1) Mean($close, 3) $high-$low - instrument datetime - SH600000 2010-01-04 86.778313 16162960.0 88.825928 88.061483 2.907631 - 2010-01-05 87.433578 28117442.0 86.778313 87.679273 3.235252 - 2010-01-06 85.713585 23632884.0 87.433578 86.641825 1.720009 - 2010-01-07 83.788803 20813402.0 85.713585 85.645322 3.030487 - 2010-01-08 84.730675 16044853.0 83.788803 84.744354 2.047623 + >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head().to_string() + ' $close $volume Ref($close, 1) Mean($close, 3) $high-$low + ... instrument datetime + ... SH600000 2010-01-04 86.778313 16162960.0 88.825928 88.061483 2.907631 + ... 2010-01-05 87.433578 28117442.0 86.778313 87.679273 3.235252 + ... 2010-01-06 85.713585 23632884.0 87.433578 86.641825 1.720009 + ... 2010-01-07 83.788803 20813402.0 85.713585 85.645322 3.030487 + ... 2010-01-08 84.730675 16044853.0 83.788803 84.744354 2.047623' Load features of certain stock pool in a given time range: @@ -105,15 +104,14 @@ Load features of certain stock pool in a given time range: >> expressionDFilter = ExpressionDFilter(rule_expression='$close>Ref($close,1)') >> instruments = D.instruments(market='csi300', filter_pipe=[nameDFilter, expressionDFilter]) >> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] - >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() - - $close $volume Ref($close, 1) Mean($close, 3) $high-$low - instrument datetime - SH600655 2010-01-04 2699.567383 158193.328125 2619.070312 2626.097738 124.580566 - 2010-01-08 2612.359619 77501.406250 2584.567627 2623.220133 83.373047 - 2010-01-11 2712.982422 160852.390625 2612.359619 2636.636556 146.621582 - 2010-01-12 2788.688232 164587.937500 2712.982422 2704.676758 128.413818 - 2010-01-13 2790.604004 145460.453125 2788.688232 2764.091553 128.413818 + >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head().to_string() + ' $close $volume Ref($close, 1) Mean($close, 3) $high-$low + ... instrument datetime + ... SH600655 2010-01-04 2699.567383 158193.328125 2619.070312 2626.097738 124.580566 + ... 2010-01-08 2612.359619 77501.406250 2584.567627 2623.220133 83.373047 + ... 2010-01-11 2712.982422 160852.390625 2612.359619 2636.636556 146.621582 + ... 
2010-01-12 2788.688232 164587.937500 2712.982422 2704.676758 128.413818 + ... 2010-01-13 2790.604004 145460.453125 2788.688232 2764.091553 128.413818' For more details about features, please refer `Feature API <../component/data.html>`_. diff --git a/docs/start/integration.rst b/docs/start/integration.rst index 801bb819d17..a9eecc4ead8 100644 --- a/docs/start/integration.rst +++ b/docs/start/integration.rst @@ -21,84 +21,88 @@ The Custom models need to inherit `qlib.model.base.Model <../reference/api.html# - ``Qlib`` passes the initialized parameters to the \_\_init\_\_ method. - The hyperparameters of model in the configuration must be consistent with those defined in the `__init__` method. - Code Example: In the following example, the hyperparameters of model in the configuration file should contain parameters such as `loss:mse`. - .. code-block:: Python - def __init__(self, loss='mse', **kwargs): - if loss not in {'mse', 'binary'}: - raise NotImplementedError - self._scorer = mean_squared_error if loss == 'mse' else roc_auc_score - self._params.update(objective=loss, **kwargs) - self._model = None + .. code-block:: Python + + def __init__(self, loss='mse', **kwargs): + if loss not in {'mse', 'binary'}: + raise NotImplementedError + self._scorer = mean_squared_error if loss == 'mse' else roc_auc_score + self._params.update(objective=loss, **kwargs) + self._model = None - Override the `fit` method - ``Qlib`` calls the fit method to train the model. - The parameters must include training feature `dataset`, which is designed in the interface. - The parameters could include some `optional` parameters with default values, such as `num_boost_round = 1000` for `GBDT`. - Code Example: In the following example, `num_boost_round = 1000` is an optional parameter. - .. code-block:: Python - - def fit(self, dataset: DatasetH, num_boost_round = 1000, **kwargs): - - # prepare dataset for lgb training and evaluation - df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L - ) - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") - - dtrain = lgb.Dataset(x_train.values, label=y_train) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) - - # fit the model - self.model = lgb.train( - self.params, - dtrain, - num_boost_round=num_boost_round, - valid_sets=[dtrain, dvalid], - valid_names=["train", "valid"], - early_stopping_rounds=early_stopping_rounds, - verbose_eval=verbose_eval, - evals_result=evals_result, - **kwargs - ) + + .. 
code-block:: Python + + def fit(self, dataset: DatasetH, num_boost_round = 1000, **kwargs): + + # prepare dataset for lgb training and evaluation + df_train, df_valid = dataset.prepare( + ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + # Lightgbm need 1D array as its label + if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: + y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) + else: + raise ValueError("LightGBM doesn't support multi-label training") + + dtrain = lgb.Dataset(x_train.values, label=y_train) + dvalid = lgb.Dataset(x_valid.values, label=y_valid) + + # fit the model + self.model = lgb.train( + self.params, + dtrain, + num_boost_round=num_boost_round, + valid_sets=[dtrain, dvalid], + valid_names=["train", "valid"], + early_stopping_rounds=early_stopping_rounds, + verbose_eval=verbose_eval, + evals_result=evals_result, + **kwargs + ) - Override the `predict` method - The parameters must include the parameter `dataset`, which will be userd to get the test dataset. - Return the `prediction score`. - Please refer to `Model API <../reference/api.html#module-qlib.model.base>`_ for the parameter types of the fit method. - Code Example: In the following example, users need to use `LightGBM` to predict the label(such as `preds`) of test data `x_test` and return it. - .. code-block:: Python - def predict(self, dataset: DatasetH, **kwargs)-> pandas.Series: - if self.model is None: - raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) - return pd.Series(self.model.predict(x_test.values), index=x_test.index) + .. code-block:: Python + + def predict(self, dataset: DatasetH, **kwargs)-> pandas.Series: + if self.model is None: + raise ValueError("model is not fitted yet!") + x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + return pd.Series(self.model.predict(x_test.values), index=x_test.index) - Override the `finetune` method (Optional) - This method is optional to the users. When users want to use this method on their own models, they should inherit the ``ModelFT`` base class, which includes the interface of `finetune`. - The parameters must include the parameter `dataset`. - Code Example: In the following example, users will use `LightGBM` as the model and finetune it. - .. code-block:: Python - - def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20): - # Based on existing model and finetune by train more rounds - dtrain, _ = self._prepare_data(dataset) - self.model = lgb.train( - self.params, - dtrain, - num_boost_round=num_boost_round, - init_model=self.model, - valid_sets=[dtrain], - valid_names=["train"], - verbose_eval=verbose_eval, - ) + + .. 
code-block:: Python + + def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20): + # Based on existing model and finetune by train more rounds + dtrain, _ = self._prepare_data(dataset) + self.model = lgb.train( + self.params, + dtrain, + num_boost_round=num_boost_round, + init_model=self.model, + valid_sets=[dtrain], + valid_names=["train"], + verbose_eval=verbose_eval, + ) Configuration File ================== @@ -107,21 +111,21 @@ The configuration file is described in detail in the `Workflow <../component/wor - Example: The following example describes the `model` field of configuration file about the custom lightgbm model mentioned above, where `module_path` is the module path, `class` is the class name, and `args` is the hyperparameter passed into the __init__ method. All parameters in the field is passed to `self._params` by `\*\*kwargs` in `__init__` except `loss = mse`. -.. code-block:: YAML - - model: - class: LGBModel - module_path: qlib.contrib.model.gbdt - args: - loss: mse - colsample_bytree: 0.8879 - learning_rate: 0.0421 - subsample: 0.8789 - lambda_l1: 205.6999 - lambda_l2: 580.9768 - max_depth: 8 - num_leaves: 210 - num_threads: 20 + .. code-block:: YAML + + model: + class: LGBModel + module_path: qlib.contrib.model.gbdt + args: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.0421 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 Users could find configuration file of the baselines of the ``Model`` in ``examples/benchmarks``. All the configurations of different models are listed under the corresponding model folder. From 8802653bb92339c78dc1b6dda5aa60cdb7e92160 Mon Sep 17 00:00:00 2001 From: Maxim Smolskiy Date: Mon, 14 Nov 2022 13:53:25 +0300 Subject: [PATCH 06/15] Fix the Warnings with duplicate object description when building Qlib's documentation (#1353) * Add :noindex: to docs/advanced/task_management.rst * Add :noindex: to docs/component/data.rst * Add :noindex: to docs/component/model.rst * Add :noindex: to docs/component/online.rst * Add :noindex: to docs/component/recorder.rst * Add :noindex: to docs/component/report.rst * Retest --- docs/advanced/task_management.rst | 4 ++++ docs/component/data.rst | 7 +++++++ docs/component/model.rst | 1 + docs/component/online.rst | 4 ++++ docs/component/recorder.rst | 3 +++ docs/component/report.rst | 4 ++++ 6 files changed, 23 insertions(+) diff --git a/docs/advanced/task_management.rst b/docs/advanced/task_management.rst index 70b6bcfc860..b1cb6c696a5 100644 --- a/docs/advanced/task_management.rst +++ b/docs/advanced/task_management.rst @@ -31,6 +31,7 @@ Here is the base class of ``TaskGen``: .. autoclass:: qlib.workflow.task.gen.TaskGen :members: + :noindex: ``Qlib`` provides a class `RollingGen `_ to generate a list of ``task`` of the dataset in different date segments. This class allows users to verify the effect of data from different periods on the model in one experiment. More information is `here <../reference/api.html#TaskGen>`__. @@ -53,6 +54,7 @@ Users need to provide the MongoDB URL and database name for using ``TaskManager` .. autoclass:: qlib.workflow.task.manage.TaskManager :members: + :noindex: More information of ``Task Manager`` can be found in `here <../reference/api.html#TaskManager>`__. @@ -64,11 +66,13 @@ An easy way to get the ``task_func`` is using ``qlib.model.trainer.task_train`` It will run the whole workflow defined by ``task``, which includes *Model*, *Dataset*, *Record*. .. 
autofunction:: qlib.workflow.task.manage.run_task + :noindex: Meanwhile, ``Qlib`` provides a module called ``Trainer``. .. autoclass:: qlib.model.trainer.Trainer :members: + :noindex: ``Trainer`` will train a list of tasks and return a list of model recorders. ``Qlib`` offer two kinds of Trainer, TrainerR is the simplest way and TrainerRM is based on TaskManager to help manager tasks lifecycle automatically. diff --git a/docs/component/data.rst b/docs/component/data.rst index d3b8cafed21..60e8d4fa1bd 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -332,6 +332,7 @@ Here are some interfaces of the ``QlibDataLoader`` class: .. autoclass:: qlib.data.dataset.loader.DataLoader :members: + :noindex: API --- @@ -361,6 +362,7 @@ Here are some important interfaces that ``DataHandlerLP`` provides: .. autoclass:: qlib.data.dataset.handler.DataHandlerLP :members: __init__, fetch, get_cols + :noindex: If users want to load features and labels by config, users can define a new handler and call the static method `parse_config_to_fields` of ``qlib.contrib.data.handler.Alpha158``. @@ -451,6 +453,7 @@ The ``DatasetH`` class is the `dataset` with `Data Handler`. Here is the most im .. autoclass:: qlib.data.dataset.__init__.DatasetH :members: + :noindex: API --- @@ -470,9 +473,11 @@ Global Memory Cache .. autoclass:: qlib.data.cache.MemCacheUnit :members: + :noindex: .. autoclass:: qlib.data.cache.MemCache :members: + :noindex: ExpressionCache @@ -487,6 +492,7 @@ The following shows the details about the interfaces: .. autoclass:: qlib.data.cache.ExpressionCache :members: + :noindex: ``Qlib`` has currently provided implemented disk cache `DiskExpressionCache` which inherits from `ExpressionCache` . The expressions data will be stored in the disk. @@ -502,6 +508,7 @@ The following shows the details about the interfaces: .. autoclass:: qlib.data.cache.DatasetCache :members: + :noindex: ``Qlib`` has currently provided implemented disk cache `DiskDatasetCache` which inherits from `DatasetCache` . The datasets' data will be stored in the disk. diff --git a/docs/component/model.rst b/docs/component/model.rst index 111f6402948..e0c630ccaac 100644 --- a/docs/component/model.rst +++ b/docs/component/model.rst @@ -20,6 +20,7 @@ The base class provides the following interfaces: .. autoclass:: qlib.model.base.Model :members: + :noindex: ``Qlib`` also provides a base class `qlib.model.base.ModelFT <../reference/api.html#qlib.model.base.ModelFT>`_, which includes the method for finetuning the model. diff --git a/docs/component/online.rst b/docs/component/online.rst index 351098db4d4..d7113c19fcf 100644 --- a/docs/component/online.rst +++ b/docs/component/online.rst @@ -32,21 +32,25 @@ Online Manager .. automodule:: qlib.workflow.online.manager :members: + :noindex: Online Strategy =============== .. automodule:: qlib.workflow.online.strategy :members: + :noindex: Online Tool =========== .. automodule:: qlib.workflow.online.utils :members: + :noindex: Updater ======= .. automodule:: qlib.workflow.online.update :members: + :noindex: diff --git a/docs/component/recorder.rst b/docs/component/recorder.rst index ed5e4762bce..ca545b75bd1 100644 --- a/docs/component/recorder.rst +++ b/docs/component/recorder.rst @@ -61,6 +61,7 @@ The ``ExpManager`` module in ``Qlib`` is responsible for managing different expe .. 
autoclass:: qlib.workflow.expm.ExpManager :members: get_exp, list_experiments + :noindex: For other interfaces such as `create_exp`, `delete_exp`, please refer to `Experiment Manager API <../reference/api.html#experiment-manager>`_. @@ -71,6 +72,7 @@ The ``Experiment`` class is solely responsible for a single experiment, and it w .. autoclass:: qlib.workflow.exp.Experiment :members: get_recorder, list_recorders + :noindex: For other interfaces such as `search_records`, `delete_recorder`, please refer to `Experiment API <../reference/api.html#experiment>`_. @@ -85,6 +87,7 @@ Here are some important APIs that are not included in the ``QlibRecorder``: .. autoclass:: qlib.workflow.recorder.Recorder :members: list_artifacts, list_metrics, list_params, list_tags + :noindex: For other interfaces such as `save_objects`, `load_object`, please refer to `Recorder API <../reference/api.html#recorder>`_. diff --git a/docs/component/report.rst b/docs/component/report.rst index 30fca078836..01d3a21234d 100644 --- a/docs/component/report.rst +++ b/docs/component/report.rst @@ -51,6 +51,7 @@ API .. automodule:: qlib.contrib.report.analysis_position.report :members: + :noindex: Graphical Result ~~~~~~~~~~~~~~~~ @@ -93,6 +94,7 @@ API .. automodule:: qlib.contrib.report.analysis_position.score_ic :members: + :noindex: Graphical Result @@ -151,6 +153,7 @@ API .. automodule:: qlib.contrib.report.analysis_position.risk_analysis :members: + :noindex: Graphical Result @@ -270,6 +273,7 @@ API .. automodule:: qlib.contrib.report.analysis_model.analysis_model_performance :members: + :noindex: Graphical Results From b51e881be3075c72f6fce30643a39b368a76e368 Mon Sep 17 00:00:00 2001 From: Maxim Smolskiy Date: Tue, 15 Nov 2022 03:49:36 +0300 Subject: [PATCH 07/15] Fix the Errors with unexpected indentation when building Qlib's documentation (#1352) * Fix ERROR: Unexpected indentation in qlib/data/dataset/handler.py * Fix ERROR: Unexpected indentation in qlib/data/dataset/__init__.py * Fix ERROR: Unexpected indentation in ../qlib/data/cache.py * Fix ERROR: Unexpected indentation in qlib/model/meta/task.py * Fix ERROR: Unexpected indentation in qlib/model/meta/dataset.py * Fix ERROR: Unexpected indentation in qlib/workflow/online/manager.py * Fix ERROR: Unexpected indentation in qlib/workflow/online/update.py * Fix ERROR: Unexpected indentation in /qlib/workflow/__init__.py * Fix ERROR: Unexpected indentation in qlib/data/base.py * Fix ERROR: Unexpected indentation in qlib/data/dataset/loader.py * Fix ERROR: Unexpected indentation in qlib/contrib/evaluate.py * Fix ERROR: Unexpected indentation in qlib/workflow/record_temp.py * Fix ERROR: Unexpected indentation in qlib/workflow/task/gen.py * Fix ERROR: Unexpected indentation in qlib/strategy/base.py * Fix qlib/data/dataset/handler.py * Retest --- qlib/contrib/evaluate.py | 8 ++++++-- qlib/data/base.py | 5 +++++ qlib/data/cache.py | 2 +- qlib/data/dataset/__init__.py | 5 +++-- qlib/data/dataset/handler.py | 32 +++++++++++++++++++++++++------- qlib/data/dataset/loader.py | 2 ++ qlib/model/meta/dataset.py | 4 ++++ qlib/model/meta/task.py | 5 ++++- qlib/strategy/base.py | 6 +++++- qlib/workflow/__init__.py | 1 + qlib/workflow/online/manager.py | 2 ++ qlib/workflow/online/update.py | 27 ++++++++++++++++++--------- qlib/workflow/record_temp.py | 2 ++ qlib/workflow/task/gen.py | 2 ++ 14 files changed, 80 insertions(+), 23 deletions(-) diff --git a/qlib/contrib/evaluate.py b/qlib/contrib/evaluate.py index 2901a40eae6..8e5cfd4fb57 100644 --- a/qlib/contrib/evaluate.py +++ 
b/qlib/contrib/evaluate.py @@ -187,9 +187,13 @@ def backtest_daily( the benchmark for reporting. account : Union[float, int, Position] information for describing how to creating the account + For `float` or `int`: + Using Account with only initial cash + For `Position`: + Using Account with a Position exchange_kwargs : dict the kwargs for initializing Exchange @@ -283,8 +287,8 @@ def long_short_backtest( NOTE: This will be faster with offline qlib. :return: The result of backtest, it is represented by a dict. { "long": long_returns(excess), - "short": short_returns(excess), - "long_short": long_short_returns} + "short": short_returns(excess), + "long_short": long_short_returns} """ if get_level_index(pred, level="datetime") == 1: pred = pred.swaplevel().sort_index() diff --git a/qlib/data/base.py b/qlib/data/base.py index cf32d333f7f..496ae38ee23 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -16,8 +16,10 @@ class Expression(abc.ABC): Expression is designed to handle the calculation of data with the format below data with two dimension for each instrument, + - feature - time: it could be observation time or period time. + - period time is designed for Point-in-time database. For example, the period time maybe 2014Q4, its value can observed for multiple times(different value may be observed at different time due to amendment). """ @@ -142,9 +144,12 @@ def load(self, instrument, start_index, end_index, *args): This function is responsible for loading feature/expression based on the expression engine. The concrete implementation will be separated into two parts: + 1) caching data, handle errors. + - This part is shared by all the expressions and implemented in Expression 2) processing and calculating data based on the specific expression. + - This part is different in each expression and implemented in each expression Expression Engine is shared by different data. diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 7c692377ad6..e7336e8bed4 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -394,7 +394,7 @@ def dataset( .. note:: The server use redis_lock to make sure read-write conflicts will not be triggered - but client readers are not considered. + but client readers are not considered. """ if disk_cache == 0: # skip cache diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index dcc9957ed63..286418bcf8a 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -205,8 +205,9 @@ def prepare( col_set : str The col_set will be passed to self.handler when fetching data. TODO: make it automatic: - - select DK_I for test data - - select DK_L for training data. + + - select DK_I for test data + - select DK_L for training data. data_key : str The data to fetch: DK_* Default is DK_I, which indicate fetching data for **inference**. diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index bb44cd893a8..5d73ac6cea4 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -160,13 +160,17 @@ def fetch( selector : Union[pd.Timestamp, slice, str] describe how to select data by index It can be categories as following + - fetch single index - fetch a range of index + - a slice range - pd.Index for specific indexes Following conflictions may occurs - - Does [20200101", "20210101"] mean selecting this slice or these two days? + + - Does ["20200101", "20210101"] mean selecting this slice or these two days? 
+ - slice have higher priorities level : Union[str, int] @@ -178,7 +182,8 @@ def fetch( select a set of meaningful, pd.Index columns.(e.g. features, columns) - if col_set == CS_RAW: + - if col_set == CS_RAW: + the raw dataset will be returned. - if isinstance(col_set, List[str]): @@ -186,8 +191,10 @@ def fetch( select several sets of meaningful columns, the returned data has multiple levels proc_func: Callable + - Give a hook for processing data before fetching - An example to explain the necessity of the hook: + - A Dataset learned some processors to process data which is related to data segmentation - It will apply them every time when preparing data. - The learned processor require the dataframe remains the same format when fitting and applying @@ -326,18 +333,23 @@ class DataHandlerLP(DataHandler): DataHandler with **(L)earnable (P)rocessor** This handler will produce three pieces of data in pd.DataFrame format. + - DK_R / self._data: the raw data loaded from the loader - DK_I / self._infer: the data processed for inference - DK_L / self._learn: the data processed for learning model. The motivation of using different processor workflows for learning and inference Here are some examples. + - The instrument universe for learning and inference may be different. - The processing of some samples may rely on label (for example, some samples hit the limit may need extra processing or be dropped). - These processors only apply to the learning phase. + + - These processors only apply to the learning phase. Tips to improve the performance of data handler + - To reduce the memory cost + - `drop_raw=True`: this will modify the data inplace on raw data; """ @@ -482,12 +494,18 @@ def process_data(self, with_fit: bool = False): Notation: (data) [processor] # data processing flow of self.process_type == DataHandlerLP.PTYPE_I - (self._data)-[shared_processors]-(_shared_df)-[learn_processors]-(_learn_df) - \ - -[infer_processors]-(_infer_df) + + .. code-block:: text + + (self._data)-[shared_processors]-(_shared_df)-[learn_processors]-(_learn_df) + \\ + -[infer_processors]-(_infer_df) # data processing flow of self.process_type == DataHandlerLP.PTYPE_A - (self._data)-[shared_processors]-(_shared_df)-[infer_processors]-(_infer_df)-[learn_processors]-(_learn_df) + + .. code-block:: text + + (self._data)-[shared_processors]-(_shared_df)-[infer_processors]-(_infer_df)-[learn_processors]-(_learn_df) Parameters ---------- diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py index c80d60bab8a..074cfa6084f 100644 --- a/qlib/data/dataset/loader.py +++ b/qlib/data/dataset/loader.py @@ -278,7 +278,9 @@ class DataLoaderDH(DataLoader): - If you just want to load data from single datahandler, you can write them in single data handler TODO: What make this module not that easy to use. + - For online scenario + - The underlayer data handler should be configured. But data loader doesn't provide such interface & hook. """ diff --git a/qlib/model/meta/dataset.py b/qlib/model/meta/dataset.py index 8238428978e..34a9b949b31 100644 --- a/qlib/model/meta/dataset.py +++ b/qlib/model/meta/dataset.py @@ -12,11 +12,15 @@ class MetaTaskDataset(Serializable, metaclass=abc.ABCMeta): A dataset fetching the data in a meta-level. A Meta Dataset is responsible for + - input tasks(e.g. Qlib tasks) and prepare meta tasks + - meta task contains more information than normal tasks (e.g. input data for meta model) The learnt pattern could transfer to other meta dataset. 
The following cases should be supported + - A meta-model trained on meta-dataset A and then applied to meta-dataset B + - Some pattern are shared between meta-dataset A and B, so meta-input on meta-dataset A are used when meta model are applied on meta-dataset-B """ diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index f59198830d3..3204910010e 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -11,9 +11,11 @@ class MetaTask: It serves as a component as in MetaDatasetDS The data processing is different + - the processed input may be different between training and testing + - When training, the X, y, X_test, y_test in training tasks are necessary (# PROC_MODE_FULL #) - but not necessary in test tasks. (# PROC_MODE_TEST #) + but not necessary in test tasks. (# PROC_MODE_TEST #) - When the meta model can be transferred into other dataset, only meta_info is necessary (# PROC_MODE_TRANSFER #) """ @@ -24,6 +26,7 @@ class MetaTask: def __init__(self, task: dict, meta_info: object, mode: str = PROC_MODE_FULL): """ The `__init__` func is responsible for + - store the task - store the origin input data for - process the input data for meta data diff --git a/qlib/strategy/base.py b/qlib/strategy/base.py index 532e88452ed..a9e138fdbb7 100644 --- a/qlib/strategy/base.py +++ b/qlib/strategy/base.py @@ -36,6 +36,7 @@ def __init__( outer_trade_decision : BaseTradeDecision, optional the trade decision of outer strategy which this strategy relies, and it will be traded in [start_time, end_time], by default None + - If the strategy is used to split trade decision, it will be used - If the strategy is used for portfolio management, it can be ignored level_infra : LevelInfrastructure, optional @@ -45,11 +46,13 @@ def __init__( trade_exchange : Exchange exchange that provides market info, used to deal order and generate report + - If `trade_exchange` is None, self.trade_exchange will be set with common_infra - It allows different trade_exchanges is used in different executions. - For example: + - In daily execution, both daily exchange and minutely are usable, but the daily exchange is - recommended because it run faster. + recommended because it run faster. - In minutely execution, the daily exchange is not usable, only the minutely exchange is recommended. """ @@ -137,6 +140,7 @@ def generate_trade_decision( ---------- execute_result : List[object], optional the executed result for trade decision, by default None + - When call the generate_trade_decision firstly, `execute_result` could be None """ raise NotImplementedError("generate_trade_decision is not implemented!") diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py index 220949c143b..aecf0ac9926 100644 --- a/qlib/workflow/__init__.py +++ b/qlib/workflow/__init__.py @@ -350,6 +350,7 @@ def set_uri(self, uri: Optional[Text]): Method to reset the current uri of current experiment manager. NOTE: + - When the uri is refer to a file path, please using the absolute path instead of strings like "~/mlruns/" The backend don't support strings like this. """ diff --git a/qlib/workflow/online/manager.py b/qlib/workflow/online/manager.py index aeeb111b27a..9a085ace513 100644 --- a/qlib/workflow/online/manager.py +++ b/qlib/workflow/online/manager.py @@ -78,7 +78,9 @@ # Can we simplify current workflow? + - Can reduce the number of state of tasks? + - For each task, we have three phases (i.e. 
task, partly trained task, final trained task) """ diff --git a/qlib/workflow/online/update.py b/qlib/workflow/online/update.py index 0360d69b77b..5047a1bd25e 100644 --- a/qlib/workflow/online/update.py +++ b/qlib/workflow/online/update.py @@ -82,19 +82,23 @@ def update(self, *args, **kwargs): class DSBasedUpdater(RecordUpdater, metaclass=ABCMeta): """ Dataset-Based Updater + - Providing updating feature for Updating data based on Qlib Dataset Assumption + - Based on Qlib dataset - - The data to be updated is a multi-level index pd.DataFrame. For example label , prediction. - - LABEL0 - datetime instrument - 2021-05-10 SH600000 0.006965 - SH600004 0.003407 - ... ... - 2021-05-28 SZ300498 0.015748 - SZ300676 -0.001321 + - The data to be updated is a multi-level index pd.DataFrame. For example label, prediction. + + .. code-block:: + + LABEL0 + datetime instrument + 2021-05-10 SH600000 0.006965 + SH600004 0.003407 + ... ... + 2021-05-28 SZ300498 0.015748 + SZ300676 -0.001321 """ def __init__( @@ -111,6 +115,7 @@ def __init__( Init PredUpdater. Expected behavior in following cases: + - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date` are affected. @@ -118,11 +123,15 @@ def __init__( record : Recorder to_date : update to prediction to the `to_date` + if to_date is None: + data will updated to the latest date. from_date : the update will start from `from_date` + if from_date is None: + the updating will occur on the next tick after the latest data in historical data hist_ref : int Sometimes, the dataset will have historical depends. diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 5f62e775891..2831482104f 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -349,7 +349,9 @@ class PortAnaRecord(ACRecordTemp): This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class. The following files will be stored in recorder + - report_normal.pkl & positions_normal.pkl: + - The return report and detailed positions of the backtest, returned by `qlib/contrib/evaluate.py:backtest` - port_analysis.pkl : The risk analysis of your portfolio, returned by `qlib/contrib/evaluate.py:risk_analysis` """ diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index 7ef7b4ed959..77bd2cbc111 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -94,7 +94,9 @@ def handler_mod(task: dict, rolling_gen): """ Help to modify the handler end time when using RollingGen It try to handle the following case + - Hander's data end_time is earlier than dataset's test_data's segments. + - To handle this, handler's data's end_time is extended. If the handler's end_time is None, then it is not necessary to change it's end time. 
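
To make the end-time extension described above concrete, here is a minimal, runnable sketch of the idea (the task layout is simplified and `extend_handler_end_time` is an illustrative stand-in; the real `handler_mod` also receives the `rolling_gen` instance and works with qlib's trading calendar):

    import datetime

    def extend_handler_end_time(task: dict) -> None:
        handler_kwargs = task["dataset"]["kwargs"]["handler"]["kwargs"]
        end_time = handler_kwargs.get("end_time")
        if end_time is None:
            return  # per the docstring above, a None end_time needs no change
        test_end = task["dataset"]["kwargs"]["segments"]["test"][1]
        if end_time < test_end:
            handler_kwargs["end_time"] = test_end  # extend to cover the test segment

    task = {
        "dataset": {
            "kwargs": {
                "handler": {"kwargs": {"end_time": datetime.date(2020, 8, 1)}},
                "segments": {"test": (datetime.date(2020, 1, 1), datetime.date(2020, 12, 31))},
            }
        }
    }
    extend_handler_end_time(task)
    assert task["dataset"]["kwargs"]["handler"]["kwargs"]["end_time"] == datetime.date(2020, 12, 31)
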
From 994f89319d86302c1ba09d41d02b957c856d3867 Mon Sep 17 00:00:00 2001
From: you-n-g
Date: Fri, 18 Nov 2022 13:11:31 +0800
Subject: [PATCH 08/15] Optimize the implementation of uri & Fix async log bug (#1364)

* Optimize the implementation of uri
* remove redundant func
* Set the right order of _set_client_uri
* Update qlib/workflow/expm.py
* Simplify client & add test. Add docs; Fix async bug
* Fix comments & pylint
* Improve README
---
 qlib/workflow/__init__.py             |   9 +--
 qlib/workflow/exp.py                  |   1 -
 qlib/workflow/expm.py                 | 105 ++++++++++++--------------
 qlib/workflow/recorder.py             |   3 +-
 setup.py                              |   2 +-
 tests/dependency_tests/README.md      |   3 +
 tests/dependency_tests/test_mlflow.py |  34 +++++++++
 7 files changed, 94 insertions(+), 63 deletions(-)
 create mode 100644 tests/dependency_tests/README.md
 create mode 100644 tests/dependency_tests/test_mlflow.py

diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py
index aecf0ac9926..d14782c60d3 100644
--- a/qlib/workflow/__init__.py
+++ b/qlib/workflow/__init__.py
@@ -8,7 +8,6 @@
 from .recorder import Recorder
 from ..utils import Wrapper
 from ..utils.exceptions import RecorderInitializationError
-from qlib.config import C


 class QlibRecorder:
@@ -347,14 +346,14 @@ def get_uri(self):

     def set_uri(self, uri: Optional[Text]):
         """
-        Method to reset the current uri of current experiment manager.
+        Method to reset the **default** uri of current experiment manager.

         NOTE:

         - When the uri is refer to a file path, please using the absolute path instead of strings like "~/mlruns/"
           The backend don't support strings like this.
         """
-        self.exp_manager.set_uri(uri)
+        self.exp_manager.default_uri = uri

     @contextmanager
     def uri_context(self, uri: Text):
@@ -370,11 +369,11 @@ def uri_context(self, uri: Text):
             the temporal uri
         """
         prev_uri = self.exp_manager.default_uri
-        C.exp_manager["kwargs"]["uri"] = uri
+        self.exp_manager.default_uri = uri
         try:
             yield
         finally:
-            C.exp_manager["kwargs"]["uri"] = prev_uri
+            self.exp_manager.default_uri = prev_uri

     def get_recorder(
         self,
diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py
index d3dd0a535df..95e5db47380 100644
--- a/qlib/workflow/exp.py
+++ b/qlib/workflow/exp.py
@@ -249,7 +249,6 @@ class MLflowExperiment(Experiment):
     def __init__(self, id, name, uri):
         super(MLflowExperiment, self).__init__(id, name)
         self._uri = uri
-        self._default_name = None
         self._default_rec_name = "mlflow_recorder"
         self._client = mlflow.tracking.MlflowClient(tracking_uri=self._uri)

diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py
index 419848517da..3aaa574dd2f 100644
--- a/qlib/workflow/expm.py
+++ b/qlib/workflow/expm.py
@@ -15,23 +15,32 @@
 from ..log import get_module_logger
 from ..utils.exceptions import ExpAlreadyExistError

+
 logger = get_module_logger("workflow")


 class ExpManager:
     """
-    This is the `ExpManager` class for managing experiments. The API is designed similar to mlflow.
-    (The link: https://mlflow.org/docs/latest/python_api/mlflow.html)
+    This is the `ExpManager` class for managing experiments. The API is designed similar to mlflow.
+    (The link: https://mlflow.org/docs/latest/python_api/mlflow.html)
+
+    The `ExpManager` is expected to be a singleton (by the way, we can have multiple `Experiment`s with different uris; users can get experiments from different uris and then compare their records). The global config (i.e. `C`) is also a singleton.
+    So we try to align them together. They share the same variable, which is called **default uri**. Please refer to `ExpManager.default_uri` for details of variable sharing.
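+
+    For example (a sketch, assuming `qlib.init()` has been called; both handles below point at the same shared variable):
+
+    .. code-block:: python
+
+        from qlib.config import C
+        from qlib.workflow import R
+
+        C.exp_manager["kwargs"]["uri"] = "file:/tmp/mlruns"
+        assert R.exp_manager.default_uri == "file:/tmp/mlruns"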
+
+    When the user starts an experiment, the user may want to set the uri to a specific uri (it will override the **default uri** during this period), and then unset the **specific uri** and fall back to the **default uri**. `ExpManager._active_exp_uri` is that **specific uri**.
     """

+    active_experiment: Optional[Experiment]
+
     def __init__(self, uri: Text, default_exp_name: Optional[Text]):
-        self._current_uri = uri
+        self.default_uri = uri
+        self._active_exp_uri = None  # No active experiments. So it is set to None
         self._default_exp_name = default_exp_name
         self.active_experiment = None  # only one experiment can be active each time
-        logger.info(f"experiment manager uri is at {self._current_uri}")
+        logger.info(f"experiment manager uri is at {self.uri}")

     def __repr__(self):
-        return "{name}(current_uri={curi})".format(name=self.__class__.__name__, curi=self._current_uri)
+        return "{name}(uri={uri})".format(name=self.__class__.__name__, uri=self.uri)

     def start_exp(
         self,
@@ -43,11 +52,13 @@
         uri: Optional[Text] = None,
         resume: bool = False,
         **kwargs,
-    ):
+    ) -> Experiment:
         """
         Start an experiment. This method includes first get_or_create an experiment, and then set it to be active.

+        Maintaining `_active_exp_uri` is handled in `start_exp`; the remaining implementation should be placed in `_start_exp` in subclasses.
+
         Parameters
         ----------
         experiment_id : str
@@ -67,12 +78,28 @@
         -------
         An active experiment.
         """
+        self._active_exp_uri = uri
+        # The subclass may set the underlying uri back.
+        # So setting `_active_exp_uri` comes before `_start_exp`
+        return self._start_exp(
+            experiment_id=experiment_id,
+            experiment_name=experiment_name,
+            recorder_id=recorder_id,
+            recorder_name=recorder_name,
+            resume=resume,
+            **kwargs,
+        )
+
+    def _start_exp(self, *args, **kwargs) -> Experiment:
+        """Please refer to the doc of `start_exp`"""
         raise NotImplementedError(f"Please implement the `start_exp` method.")

     def end_exp(self, recorder_status: Text = Recorder.STATUS_S, **kwargs):
         """
         End an active experiment.

+        Maintaining `_active_exp_uri` is handled in `end_exp`; the remaining implementation should be placed in `_end_exp` in subclasses.
+
         Parameters
         ----------
         experiment_name : str
@@ -80,6 +107,12 @@
         recorder_status : str
             the status of the active recorder of the experiment.
         """
+        self._active_exp_uri = None
+        # The subclass may set the underlying uri back.
+        # So setting `_active_exp_uri` comes before `_end_exp`
+        self._end_exp(recorder_status=recorder_status, **kwargs)
+
+    def _end_exp(self, recorder_status: Text = Recorder.STATUS_S, **kwargs):
         raise NotImplementedError(f"Please implement the `end_exp` method.")

     def create_exp(self, experiment_name: Optional[Text] = None):
@@ -254,6 +287,10 @@
             raise ValueError("The default URI is not set in qlib.config.C")
         return C.exp_manager["kwargs"]["uri"]

+    @default_uri.setter
+    def default_uri(self, value):
+        C.exp_manager.setdefault("kwargs", {})["uri"] = value
+
     @property
     def uri(self):
         """
@@ -263,33 +300,7 @@
         ------
         The tracking URI string.
         """
-        return self._current_uri or self.default_uri
-
-    def set_uri(self, uri: Optional[Text] = None):
-        """
-        Set the current tracking URI and the corresponding variables.
-
-        Parameters
-        ----------
-        uri : str
-
-        """
-        if uri is None:
-            if self._current_uri is None:
-                logger.debug("No tracking URI is provided.
Use the default tracking URI.")
-                self._current_uri = self.default_uri
-        else:
-            # Temporarily re-set the current uri as the uri argument.
-            self._current_uri = uri
-        # Customized features for subclasses.
-        self._set_uri()
-
-    def _set_uri(self):
-        """
-        Customized features for subclasses' set_uri function.
-        This method is designed for the underlying experiment backend storage.
-        """
-        raise NotImplementedError(f"Please implement the `_set_uri` method.")
+        return self._active_exp_uri or self.default_uri

     def list_experiments(self):
         """
@@ -307,33 +318,21 @@ class MLflowExpManager(ExpManager):
     Use mlflow to implement ExpManager.
     """

-    def __init__(self, uri: Text, default_exp_name: Optional[Text]):
-        super(MLflowExpManager, self).__init__(uri, default_exp_name)
-        self._client = None
-
-    def _set_uri(self):
-        self._client = mlflow.tracking.MlflowClient(tracking_uri=self.uri)
-        logger.info("{:}".format(self._client))
-
     @property
     def client(self):
-        # Delay the creation of mlflow client in case of creating `mlruns` folder when importing qlib
-        if self._client is None:
-            self._client = mlflow.tracking.MlflowClient(tracking_uri=self.uri)
-        return self._client
+        # Please refer to `tests/dependency_tests/test_mlflow.py::MLflowTest::test_creating_client`
+        # The test ensures that creating a new client is fast
+        return mlflow.tracking.MlflowClient(tracking_uri=self.uri)

-    def start_exp(
+    def _start_exp(
         self,
         *,
         experiment_id: Optional[Text] = None,
         experiment_name: Optional[Text] = None,
         recorder_id: Optional[Text] = None,
         recorder_name: Optional[Text] = None,
-        uri: Optional[Text] = None,
         resume: bool = False,
     ):
-        # Set the tracking uri
-        self.set_uri(uri)
         # Create experiment
         if experiment_name is None:
             experiment_name = self._default_exp_name
@@ -345,12 +344,10 @@

         return self.active_experiment

-    def end_exp(self, recorder_status: Text = Recorder.STATUS_S):
+    def _end_exp(self, recorder_status: Text = Recorder.STATUS_S):
         if self.active_experiment is not None:
             self.active_experiment.end(recorder_status)
             self.active_experiment = None
-        # When an experiment end, we will release the current uri.
-        self._current_uri = None

     def create_exp(self, experiment_name: Optional[Text] = None):
         assert experiment_name is not None
@@ -362,9 +359,7 @@
                 raise ExpAlreadyExistError() from e
             raise e

-        experiment = MLflowExperiment(experiment_id, experiment_name, self.uri)
-        experiment._default_name = self._default_exp_name
-        return experiment
+        return MLflowExperiment(experiment_id, experiment_name, self.uri)

     def _get_exp(self, experiment_id=None, experiment_name=None):
         """
diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py
index 1b46466013b..9d82bf0a47f 100644
--- a/qlib/workflow/recorder.py
+++ b/qlib/workflow/recorder.py
@@ -378,14 +378,15 @@ def end_run(self, status: str = Recorder.STATUS_S):
             Recorder.STATUS_FI,
             Recorder.STATUS_FA,
         ], f"The status type {status} is not supported."
-        mlflow.end_run(status)
         self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         if self.status != Recorder.STATUS_S:
             self.status = status
         if self.async_log is not None:
+            # Waiting for the queue should go before mlflow.end_run. Otherwise mlflow will raise an error
             with TimeInspector.logt("waiting `async_log`"):
                 self.async_log.wait()
             self.async_log = None
+        mlflow.end_run(status)

     def save_objects(self, local_path=None, artifact_path=None, **kwargs):
         assert self.uri is not None, "Please start the experiment and recorder first before using recorder directly."
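
As a usage sketch of the reworked uri handling in this patch (assuming `qlib.init()` has been called; the paths are illustrative):

    from qlib.workflow import R

    # start(uri=...) installs the *specific* uri for the lifetime of the
    # experiment; end_exp then clears it, falling back to the default uri.
    with R.start(experiment_name="demo", uri="/tmp/mlruns_a"):
        pass  # everything recorded here goes under /tmp/mlruns_a

    # uri_context now swaps the shared *default* uri through the new setter
    # instead of patching C.exp_manager["kwargs"]["uri"] by hand.
    with R.uri_context(uri="/tmp/mlruns_a"):
        recorder = R.get_recorder(experiment_name="demo")
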
diff --git a/setup.py b/setup.py
index a796ecf4b7a..faf058d6315 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,7 @@ def get_version(rel_path: str) -> str:
         "matplotlib>=3.3",
         "tables>=3.6.1",
         "pyyaml>=5.3.1",
-        "mlflow>=1.12.1",
+        "mlflow>=1.12.1, <=1.30.0",
         "tqdm",
         "loguru",
         "lightgbm>=3.3.0",
diff --git a/tests/dependency_tests/README.md b/tests/dependency_tests/README.md
new file mode 100644
index 00000000000..544fac130a3
--- /dev/null
+++ b/tests/dependency_tests/README.md
@@ -0,0 +1,3 @@
+Some implementations of Qlib depend on some assumptions of its dependencies.
+
+So some tests are required to ensure that these assumptions are valid.
diff --git a/tests/dependency_tests/test_mlflow.py b/tests/dependency_tests/test_mlflow.py
new file mode 100644
index 00000000000..94f164a3577
--- /dev/null
+++ b/tests/dependency_tests/test_mlflow.py
@@ -0,0 +1,34 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+import unittest
+import mlflow
+import time
+from pathlib import Path
+import shutil
+
+
+class MLflowTest(unittest.TestCase):
+    TMP_PATH = Path("./.mlruns_tmp/")
+
+    def tearDown(self) -> None:
+        if self.TMP_PATH.exists():
+            shutil.rmtree(self.TMP_PATH)
+
+    def test_creating_client(self):
+        """
+        Please refer to qlib/workflow/expm.py:MLflowExpManager._client
+        we don't cache _client (this is helpful to reduce maintenance work when MLflowExpManager's uri is changed)
+
+        This implementation is based on the assumption that creating a client is fast
+        """
+        start = time.time()
+        for i in range(10):
+            _ = mlflow.tracking.MlflowClient(tracking_uri=str(self.TMP_PATH))
+        end = time.time()
+        elasped = end - start
+        self.assertLess(elasped, 1e-2)  # it can be done in less than 10ms
+        print(elasped)
+
+
+if __name__ == "__main__":
+    unittest.main()
From e47b0f1c500d808aa5599e0ffe544abffa00eff2 Mon Sep 17 00:00:00 2001
From: Chia-hung Tai
Date: Sat, 19 Nov 2022 11:52:34 +0800
Subject: [PATCH 09/15] Fix typo. (#1365)

---
 tests/dependency_tests/test_mlflow.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/dependency_tests/test_mlflow.py b/tests/dependency_tests/test_mlflow.py
index 94f164a3577..578376a8577 100644
--- a/tests/dependency_tests/test_mlflow.py
+++ b/tests/dependency_tests/test_mlflow.py
@@ -25,9 +25,9 @@ def test_creating_client(self):
         for i in range(10):
             _ = mlflow.tracking.MlflowClient(tracking_uri=str(self.TMP_PATH))
         end = time.time()
-        elasped = end - start
-        self.assertLess(elasped, 1e-2)  # it can be done in less than 10ms
-        print(elasped)
+        elapsed = end - start
+        self.assertLess(elapsed, 1e-2)  # it can be done in less than 10ms
+        print(elapsed)
From 0c4db8b0f87d8613aac965372fac7fd21c34f9c1 Mon Sep 17 00:00:00 2001
From: Chia-hung Tai
Date: Sat, 19 Nov 2022 11:56:30 +0800
Subject: [PATCH 10/15] Set _artifact_uri when mlflow_run is not None. (#1367)

* Set _artifact_uri when mlflow_run is not None.
* Fix black.
--- qlib/workflow/recorder.py | 1 + tests/test_workflow.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/test_workflow.py diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 9d82bf0a47f..33903f7ed3c 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -279,6 +279,7 @@ def __init__(self, experiment_id, uri, name=None, mlflow_run=None): if mlflow_run.info.end_time is not None else None ) + self._artifact_uri = mlflow_run.info.artifact_uri self.async_log = None def __repr__(self): diff --git a/tests/test_workflow.py b/tests/test_workflow.py new file mode 100644 index 00000000000..129abc0fbb4 --- /dev/null +++ b/tests/test_workflow.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import unittest +from pathlib import Path +import shutil + +from qlib.workflow import R +from qlib.tests import TestAutoData + + +class WorkflowTest(TestAutoData): + TMP_PATH = Path("./.mlruns_tmp/") + + def tearDown(self) -> None: + if self.TMP_PATH.exists(): + shutil.rmtree(self.TMP_PATH) + + def test_get_local_dir(self): + """ """ + with R.start(uri=str(self.TMP_PATH)): + pass + + with R.uri_context(uri=str(self.TMP_PATH)): + resume_recorder = R.get_recorder() + resume_recorder.get_local_dir() + + +if __name__ == "__main__": + unittest.main() From cc01812c6259dec9c89b233a3899698c8a19acb0 Mon Sep 17 00:00:00 2001 From: YQ Tsui Date: Sun, 20 Nov 2022 14:15:59 +0800 Subject: [PATCH 11/15] Fix typos and grammar errors in docstrings and comments (#1366) * fix gramma error in doc strings * fix typos in exchange.py * fix typos and gramma errors * fix typo and rename function param to avoid shading python keyword * remove redundant parathesis; pass kwargs to parent class * fix pyblack * further correction * assign -> be assigned to --- qlib/backtest/exchange.py | 26 +++++------ qlib/contrib/data/handler.py | 45 ++++++++++--------- qlib/contrib/model/pytorch_adarnn.py | 2 +- qlib/contrib/model/pytorch_add.py | 2 +- qlib/contrib/model/pytorch_alstm.py | 2 +- qlib/contrib/model/pytorch_alstm_ts.py | 2 +- qlib/contrib/model/pytorch_gats.py | 2 +- qlib/contrib/model/pytorch_gats_ts.py | 2 +- qlib/contrib/model/pytorch_gru.py | 2 +- qlib/contrib/model/pytorch_gru_ts.py | 2 +- qlib/contrib/model/pytorch_hist.py | 2 +- qlib/contrib/model/pytorch_igmtf.py | 2 +- qlib/contrib/model/pytorch_lstm.py | 2 +- qlib/contrib/model/pytorch_lstm_ts.py | 2 +- qlib/contrib/model/pytorch_tcn.py | 2 +- qlib/contrib/model/pytorch_tcn_ts.py | 2 +- qlib/contrib/model/pytorch_tcts.py | 2 +- qlib/data/dataset/handler.py | 17 +++---- qlib/data/dataset/storage.py | 3 +- qlib/rl/utils/log.py | 6 +-- qlib/workflow/online/manager.py | 2 +- qlib/workflow/record_temp.py | 2 +- qlib/workflow/utils.py | 12 ++--- scripts/data_collector/crowd_source/README.md | 6 +-- 24 files changed, 77 insertions(+), 72 deletions(-) diff --git a/qlib/backtest/exchange.py b/qlib/backtest/exchange.py index cc760be44dd..a2cc13623b1 100644 --- a/qlib/backtest/exchange.py +++ b/qlib/backtest/exchange.py @@ -27,10 +27,10 @@ class Exchange: # `quote_df` is a pd.DataFrame class that contains basic information for backtesting - # After some processing, the data will later be maintained by `quote_cls` object for faster data retriving. + # After some processing, the data will later be maintained by `quote_cls` object for faster data retrieving. # Some conventions for `quote_df` # - $close is for calculating the total value at end of each day. 
- # - if $close is None, the stock on that day is reguarded as suspended. + # - if $close is None, the stock on that day is regarded as suspended. # - $factor is for rounding to the trading unit; # - if any $factor is missing when $close exists, trading unit rounding will be disabled quote_df: pd.DataFrame @@ -141,7 +141,7 @@ def __init__( if deal_price is None: deal_price = C.deal_price - # we have some verbose information here. So logging is enable + # we have some verbose information here. So logging is enabled self.logger = get_module_logger("online operator") # TODO: the quote, trade_dates, codes are not necessary. @@ -168,7 +168,7 @@ def __init__( self.codes = codes # Necessary fields # $close is for calculating the total value at end of each day. - # - if $close is None, the stock on that day is reguarded as suspended. + # - if $close is None, the stock on that day is regarded as suspended. # $factor is for rounding to the trading unit # $change is for calculating the limit of the stock @@ -271,7 +271,7 @@ def _get_limit_type(self, limit_threshold: Union[tuple, float, None]) -> str: raise NotImplementedError(f"This type of `limit_threshold` is not supported") def _update_limit(self, limit_threshold: Union[Tuple, float, None]) -> None: - # $close is may contains NaN, the nan indicates that the stock is not tradable at that timestamp + # $close may contain NaN, the nan indicates that the stock is not tradable at that timestamp suspended = self.quote_df["$close"].isna() # check limit_threshold limit_type = self._get_limit_type(limit_threshold) @@ -356,12 +356,12 @@ def check_stock_limit( Returns ------- - True: the trading of the stock is limted (maybe hit the highest/lowest price), hence the stock is not tradable + True: the trading of the stock is limited (maybe hit the highest/lowest price), hence the stock is not tradable False: the trading of the stock is not limited, hence the stock may be tradable """ # NOTE: # **all** is used when checking limitation. - # For example, the stock trading is limited in a day if every miniute is limited in a day if every miniute is limited. + # For example, the stock trading is limited in a day if every minute is limited in a day if every minute is limited. if direction is None: # The trading limitation is related to the trading direction # if the direction is not provided, then any limitation from buy or sell will result in trading limitation @@ -385,17 +385,17 @@ def check_stock_suspended( # is suspended if stock_id in self.quote.get_all_stock(): # suspended stocks are represented by None $close stock - # The $close may contains NaN, + # The $close may contain NaN, close = self.quote.get_data(stock_id, start_time, end_time, "$close") if close is None: # if no close record exists return True elif isinstance(close, IndexData): - # **any** non-NaN $close represents trading opportunity may exists + # **any** non-NaN $close represents trading opportunity may exist # if all returned is nan, then the stock is suspended return cast(bool, cast(IndexData, close).isna().all()) else: - # it is single value, make sure is is not None + # it is single value, make sure is not None return np.isnan(close) else: # if the stock is not in the stock list, then it is not tradable and regarded as suspended @@ -540,8 +540,8 @@ def generate_amount_position_from_weight_position( direction: OrderDir = OrderDir.BUY, ) -> dict: """ - The generate the target position according to the weight and the cash. - NOTE: All the cash will assigned to the tradable stock. 
+ Generates the target position according to the weight and the cash. + NOTE: All the cash will be assigned to the tradable stock. Parameter: weight_position : dict {stock_id : weight}; allocate cash by weight_position among then, weight must be in this range: 0 < weight < 1 @@ -639,7 +639,7 @@ def generate_order_for_target_amount_position( random.shuffle(sorted_ids) for stock_id in sorted_ids: - # Do not generate order for the nontradable stocks + # Do not generate order for the non-tradable stocks if not self.is_stock_tradable(stock_id=stock_id, start_time=start_time, end_time=end_time): continue diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index 07eb2da2585..ca3ca5545b0 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -57,7 +57,7 @@ def __init__( fit_end_time=None, filter_pipe=None, inst_processor=None, - **kwargs, + **kwargs ): infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -67,7 +67,7 @@ def __init__( "kwargs": { "config": { "feature": self.get_feature_config(), - "label": kwargs.get("label", self.get_label_config()), + "label": kwargs.pop("label", self.get_label_config()), }, "filter_pipe": filter_pipe, "freq": freq, @@ -82,12 +82,14 @@ def __init__( data_loader=data_loader, learn_processors=learn_processors, infer_processors=infer_processors, + **kwargs ) def get_label_config(self): - return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] - def get_feature_config(self): + @staticmethod + def get_feature_config(): # NOTE: # Alpha360 tries to provide a dataset with original price data # the original price data includes the prices and volume in the last 60 days. 
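+        # i.e. 6 groups (CLOSE/OPEN/HIGH/LOW/VWAP/VOLUME) x 60 time steps = the 360 columns built below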
@@ -99,33 +101,33 @@ def get_feature_config(self): names = [] for i in range(59, 0, -1): - fields += ["Ref($close, %d)/$close" % (i)] - names += ["CLOSE%d" % (i)] + fields += ["Ref($close, %d)/$close" % i] + names += ["CLOSE%d" % i] fields += ["$close/$close"] names += ["CLOSE0"] for i in range(59, 0, -1): - fields += ["Ref($open, %d)/$close" % (i)] - names += ["OPEN%d" % (i)] + fields += ["Ref($open, %d)/$close" % i] + names += ["OPEN%d" % i] fields += ["$open/$close"] names += ["OPEN0"] for i in range(59, 0, -1): - fields += ["Ref($high, %d)/$close" % (i)] - names += ["HIGH%d" % (i)] + fields += ["Ref($high, %d)/$close" % i] + names += ["HIGH%d" % i] fields += ["$high/$close"] names += ["HIGH0"] for i in range(59, 0, -1): - fields += ["Ref($low, %d)/$close" % (i)] - names += ["LOW%d" % (i)] + fields += ["Ref($low, %d)/$close" % i] + names += ["LOW%d" % i] fields += ["$low/$close"] names += ["LOW0"] for i in range(59, 0, -1): - fields += ["Ref($vwap, %d)/$close" % (i)] - names += ["VWAP%d" % (i)] + fields += ["Ref($vwap, %d)/$close" % i] + names += ["VWAP%d" % i] fields += ["$vwap/$close"] names += ["VWAP0"] for i in range(59, 0, -1): - fields += ["Ref($volume, %d)/($volume+1e-12)" % (i)] - names += ["VOLUME%d" % (i)] + fields += ["Ref($volume, %d)/($volume+1e-12)" % i] + names += ["VOLUME%d" % i] fields += ["$volume/($volume+1e-12)"] names += ["VOLUME0"] @@ -134,7 +136,7 @@ def get_feature_config(self): class Alpha360vwap(Alpha360): def get_label_config(self): - return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) + return ["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"] class Alpha158(DataHandlerLP): @@ -151,7 +153,7 @@ def __init__( process_type=DataHandlerLP.PTYPE_A, filter_pipe=None, inst_processor=None, - **kwargs, + **kwargs ): infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -161,7 +163,7 @@ def __init__( "kwargs": { "config": { "feature": self.get_feature_config(), - "label": kwargs.get("label", self.get_label_config()), + "label": kwargs.pop("label", self.get_label_config()), }, "filter_pipe": filter_pipe, "freq": freq, @@ -176,6 +178,7 @@ def __init__( infer_processors=infer_processors, learn_processors=learn_processors, process_type=process_type, + **kwargs ) def get_feature_config(self): @@ -190,7 +193,7 @@ def get_feature_config(self): return self.parse_config_to_fields(conf) def get_label_config(self): - return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] @staticmethod def parse_config_to_fields(config): @@ -426,4 +429,4 @@ def use(x): class Alpha158vwap(Alpha158): def get_label_config(self): - return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) + return ["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"] diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py index 3641bd511bd..7570d74e0ba 100644 --- a/qlib/contrib/model/pytorch_adarnn.py +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -28,7 +28,7 @@ class ADARNN(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_add.py b/qlib/contrib/model/pytorch_add.py index b214daed375..e929fe97f86 100644 --- a/qlib/contrib/model/pytorch_add.py +++ b/qlib/contrib/model/pytorch_add.py @@ -36,7 +36,7 @@ class ADD(Model): 
d_feat : int input dimensions for each time step metric : str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : int diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index 13e3bf87976..b0770e2bdde 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -30,7 +30,7 @@ class ALSTM(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : int diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 60645e2a3a4..3ab8ed8ab56 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -33,7 +33,7 @@ class ALSTM(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : int diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index a737bddc42d..1274088773f 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -33,7 +33,7 @@ class GATs(Model): d_feat : int input dimensions for each time step metric : str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : int diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index e8446d868e2..1b75efe890c 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -50,7 +50,7 @@ class GATs(Model): d_feat : int input dimensions for each time step metric : str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : int diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 2275b86e194..10998236bb9 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -30,7 +30,7 @@ class GRU(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index 390a669244d..b588392a21e 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -31,7 +31,7 @@ class GRU(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_hist.py b/qlib/contrib/model/pytorch_hist.py index 25445ba592b..f7b565dc543 100644 --- a/qlib/contrib/model/pytorch_hist.py +++ b/qlib/contrib/model/pytorch_hist.py @@ -34,7 +34,7 @@ class HIST(Model): d_feat : int input dimensions for each time step metric : str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py index e3a07c3417f..d38ef9ad484 100644 --- a/qlib/contrib/model/pytorch_igmtf.py +++ b/qlib/contrib/model/pytorch_igmtf.py @@ -32,7 +32,7 @@ class IGMTF(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in 
early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 494fd4a0e0a..a68cf5eacba 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -29,7 +29,7 @@ class LSTM(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index e703130fb27..f1a3c55e87e 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -30,7 +30,7 @@ class LSTM(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py index 8c40683fed6..2af7a04ea00 100755 --- a/qlib/contrib/model/pytorch_tcn.py +++ b/qlib/contrib/model/pytorch_tcn.py @@ -33,7 +33,7 @@ class TCN(Model): n_chans: int number of channels metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index 13c125d27e7..4972a3065b3 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -30,7 +30,7 @@ class TCN(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/contrib/model/pytorch_tcts.py b/qlib/contrib/model/pytorch_tcts.py index 4f87e5f1e93..b46835cb655 100644 --- a/qlib/contrib/model/pytorch_tcts.py +++ b/qlib/contrib/model/pytorch_tcts.py @@ -29,7 +29,7 @@ class TCTS(Model): d_feat : int input dimension for each time step metric: str - the evaluate metric used in early stop + the evaluation metric used in early stop optimizer : str optimizer name GPU : str diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index 5d73ac6cea4..7815445c1e5 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -137,7 +137,7 @@ def setup_data(self, enable_cache: bool = False): # Setup data. # _data may be with multiple column index level. The outer level indicates the feature set name with TimeInspector.logt("Loading data"): - # make sure the fetch method is based on a index-sorted pd.DataFrame + # make sure the fetch method is based on an index-sorted pd.DataFrame self._data = lazy_sort_index(self.data_loader.load(self.instruments, self.start_time, self.end_time)) # TODO: cache @@ -167,7 +167,7 @@ def fetch( - a slice range - pd.Index for specific indexes - Following conflictions may occurs + Following conflicts may occur - Does ["20200101", "20210101"] mean selecting this slice or these two days? @@ -229,7 +229,7 @@ def _fetch_data( # This method is extracted for sharing in subclasses from .storage import BaseHandlerStorage # pylint: disable=C0415 - # Following conflictions may occurs + # Following conflicts may occur # - Does [20200101", "20210101"] mean selecting this slice or these two days? 
# To solve this issue # - slice have higher priorities (except when level is none) @@ -313,7 +313,7 @@ def get_range_iterator( self, periods: int, min_periods: Optional[int] = None, **kwargs ) -> Iterator[Tuple[pd.Timestamp, pd.DataFrame]]: """ - get a iterator of sliced data with given periods + get an iterator of sliced data with given periods Args: periods (int): number of periods. @@ -412,13 +412,13 @@ def __init__( process_type: str PTYPE_I = 'independent' - - self._infer will processed by infer_processors + - self._infer will be processed by infer_processors - self._learn will be processed by learn_processors PTYPE_A = 'append' - - self._infer will processed by infer_processors + - self._infer will be processed by infer_processors - self._learn will be processed by infer_processors + learn_processors @@ -671,7 +671,8 @@ def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list: def cast(cls, handler: "DataHandlerLP") -> "DataHandlerLP": """ Motivation - - A user create a datahandler in his customized package. Then he want to share the processed handler to other users without introduce the package dependency and complicated data processing logic. + - A user creates a datahandler in his customized package. Then he wants to share the processed handler to + other users without introduce the package dependency and complicated data processing logic. - This class make it possible by casting the class to DataHandlerLP and only keep the processed data Parameters @@ -685,7 +686,7 @@ def cast(cls, handler: "DataHandlerLP") -> "DataHandlerLP": the converted processed data """ new_hd: DataHandlerLP = object.__new__(DataHandlerLP) - new_hd.from_cast = True # add a mark for the casted instance + new_hd.from_cast = True # add a mark for the cast instance for key in list(DataHandlerLP.ATTR_MAP.values()) + [ "instruments", diff --git a/qlib/data/dataset/storage.py b/qlib/data/dataset/storage.py index a8ccdadaa3b..49afef9128d 100644 --- a/qlib/data/dataset/storage.py +++ b/qlib/data/dataset/storage.py @@ -8,7 +8,8 @@ class BaseHandlerStorage: - """Base data storage for datahandler + """ + Base data storage for datahandler - pd.DataFrame is the default data storage format in Qlib datahandler - If users want to use custom data storage, they should define subclass inherited BaseHandlerStorage, and implement the following method """ diff --git a/qlib/rl/utils/log.py b/qlib/rl/utils/log.py index e15bf7b54b4..2a113e47cd8 100644 --- a/qlib/rl/utils/log.py +++ b/qlib/rl/utils/log.py @@ -121,7 +121,7 @@ def add_any(self, name: str, obj: Any, loglevel: int | LogLevel = LogLevel.PERIO """Log something with any type. As it's an "any" object, the only LogWriter accepting it is pickle. - Therefore pickle must be able to serialize it. + Therefore, pickle must be able to serialize it. """ if loglevel < self._min_loglevel: return @@ -243,7 +243,7 @@ def log_episode(self, length: int, rewards: List[float], contents: List[Dict[str rewards A list of rewards at each step of this episode. contents - Logged contents for every steps. + Logged contents for every step. """ def log_step(self, reward: float, contents: Dict[str, Any]) -> None: @@ -285,7 +285,7 @@ def on_env_step(self, env_id: int, obs: ObsType, rew: float, done: bool, info: I self.log_episode(self.episode_lengths[env_id], self.episode_rewards[env_id], self.episode_logs[env_id]) - def on_env_reset(self, env_id: int, obs: ObsType) -> None: + def on_env_reset(self, env_id: int, _: ObsType) -> None: """Callback for finite env. 
Reset episode statistics. Nothing task-specific is logged here because of diff --git a/qlib/workflow/online/manager.py b/qlib/workflow/online/manager.py index 9a085ace513..35e73821c85 100644 --- a/qlib/workflow/online/manager.py +++ b/qlib/workflow/online/manager.py @@ -35,7 +35,7 @@ different time segments (based on whether or not any new model is online). ========================= =================================================================================== -Here is some pseudo code the demonstrate the workflow of each situation +Here is some pseudo code that demonstrate the workflow of each situation For simplicity - Only one strategy is used in the strategy diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 2831482104f..ffda529da86 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -178,7 +178,7 @@ def generate_label(dataset): # The backend handler should be DataHandler raw_label = dataset.prepare(**params) except AttributeError as e: - # The data handler is initialize with `drop_raw=True`... + # The data handler is initialized with `drop_raw=True`... # So raw_label is not available logger.warning(f"Exception: {e}") raw_label = None diff --git a/qlib/workflow/utils.py b/qlib/workflow/utils.py index 5a2f28d2363..0f48c74f0b2 100644 --- a/qlib/workflow/utils.py +++ b/qlib/workflow/utils.py @@ -18,30 +18,30 @@ def experiment_exit_handler(): """ Method for handling the experiment when any unusual program ending occurs. The `atexit` handler should be put in the last, since, as long as the program ends, it will be called. - Thus, if any exception or user interuption occurs beforehead, we should handle them first. Once `R` is + Thus, if any exception or user interruption occurs beforehand, we should handle them first. Once `R` is ended, another call of `R.end_exp` will not take effect. Limitations: - - If pdb is used in the your program, excepthook will not be triggered when it ends. The status will be finished + - If pdb is used in your program, excepthook will not be triggered when it ends. The status will be finished """ sys.excepthook = experiment_exception_hook # handle uncaught exception atexit.register(R.end_exp, recorder_status=Recorder.STATUS_FI) # will not take effect if experiment ends -def experiment_exception_hook(type, value, tb): +def experiment_exception_hook(exc_type, value, tb): """ End an experiment with status to be "FAILED". This exception tries to catch those uncaught exception and end the experiment automatically. Parameters - type: Exception type + exc_type: Exception type value: Exception's value tb: Exception's traceback """ - logger.error(f"An exception has been raised[{type.__name__}: {value}].") + logger.error(f"An exception has been raised[{exc_type.__name__}: {value}].") # Same as original format traceback.print_tb(tb) - print(f"{type.__name__}: {value}") + print(f"{exc_type.__name__}: {value}") R.end_exp(recorder_status=Recorder.STATUS_FA) diff --git a/scripts/data_collector/crowd_source/README.md b/scripts/data_collector/crowd_source/README.md index 14ddab154ec..cdf36564b2f 100644 --- a/scripts/data_collector/crowd_source/README.md +++ b/scripts/data_collector/crowd_source/README.md @@ -1,9 +1,9 @@ # Crowd Source Data ## Initiative -Public data source like yahoo is flawed, it might miss data for stock which is delisted and it might has data which is wrong. This can introduce survivorship bias into our training process. 
+Public data source like yahoo is flawed, it might miss data for stock which is delisted and it might have data which is wrong. This can introduce survivorship bias into our training process. -The crowd sourced data is introduced to merged data from multiple data source and cross validate against each other, so that: +The Crowd Source Data is introduced to merged data from multiple data source and cross validate against each other, so that: 1. We will have a more complete history record. 2. We can identify the anomaly data and apply correction when necessary. @@ -12,7 +12,7 @@ The raw data is hosted on dolthub repo: https://www.dolthub.com/repositories/che The processing script and sql is hosted on github repo: https://github.com/chenditc/investment_data -The pakcaged docker runtime is hosted on dockerhub: https://hub.docker.com/repository/docker/chenditc/investment_data +The packaged docker runtime is hosted on dockerhub: https://hub.docker.com/repository/docker/chenditc/investment_data ## How to use it in qlib ### Option 1: Download release bin data From c4ee9ff88266665046d78e3f338f35a4c0afb8d4 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Sun, 20 Nov 2022 14:18:35 +0800 Subject: [PATCH 12/15] Fixed log_param error (#1362) * fix_qrun_error * add_description --- qlib/workflow/recorder.py | 2 ++ setup.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 33903f7ed3c..4b3fde20fcf 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -21,6 +21,8 @@ from mlflow.store.artifact.azure_blob_artifact_repo import AzureBlobArtifactRepository logger = get_module_logger("workflow", logging.INFO) +# mlflow limits the length of log_param to 500, but this caused errors when using qrun, so we extended the mlflow limit. +mlflow.utils.validation.MAX_PARAM_VAL_LENGTH = 1000 class Recorder: diff --git a/setup.py b/setup.py index faf058d6315..9ff13d4a51f 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,8 @@ def get_version(rel_path: str) -> str: "matplotlib>=3.3", "tables>=3.6.1", "pyyaml>=5.3.1", + # To ensure stable operation of the experiment manager, we have limited the version of mlflow, + # and we need to verify whether version 2.0 of mlflow can serve qlib properly. 
"mlflow>=1.12.1, <=1.30.0", "tqdm", "loguru", From 7e5bab599ae2db49aa4fc1f462c73f67d9246a57 Mon Sep 17 00:00:00 2001 From: Di Date: Mon, 28 Nov 2022 14:02:44 +0800 Subject: [PATCH 13/15] Add early stopping to double ensemble model, add example (#1375) * Add early stopping to double ensemble model, add example * Fix lint error --- ...ig_doubleensemble_early_stop_Alpha158.yaml | 95 +++++++++++++++++++ qlib/contrib/model/double_ensemble.py | 11 ++- 2 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml new file mode 100644 index 00000000000..b3c38870e6e --- /dev/null +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml @@ -0,0 +1,95 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + - + - + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DEnsembleModel + module_path: qlib.contrib.model.double_ensemble + kwargs: + base_model: "gbm" + loss: mse + num_models: 3 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 1 + - 1 + epochs: 1000 + early_stopping_rounds: 50 + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + verbosity: -1 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 50c3d22b47c..f0b2188d061 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -30,6 +30,7 @@ def __init__( sample_ratios=None, sub_weights=None, epochs=100, + early_stopping_rounds=None, **kwargs ): self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" @@ -59,6 +60,7 @@ def __init__( self.params = {"objective": loss} self.params.update(kwargs) self.loss = loss + self.early_stopping_rounds = early_stopping_rounds def fit(self, dataset: DatasetH): df_train, df_valid = dataset.prepare( @@ -103,14 +105,19 @@ def fit(self, 
dataset: DatasetH): def train_submodel(self, df_train, df_valid, weights, features): dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) evals_result = dict() + + callbacks = [lgb.log_evaluation(20), lgb.record_evaluation(evals_result)] + if self.early_stopping_rounds: + callbacks.append(lgb.early_stopping(self.early_stopping_rounds)) + self.logger.info("Training with early_stopping...") + model = lgb.train( self.params, dtrain, num_boost_round=self.epochs, valid_sets=[dtrain, dvalid], valid_names=["train", "valid"], - verbose_eval=20, - evals_result=evals_result, + callbacks=callbacks, ) evals_result["train"] = list(evals_result["train"].values())[0] evals_result["valid"] = list(evals_result["valid"].values())[0] From 4f5ae4d2247a37e899c879b95c01ed69468fa918 Mon Sep 17 00:00:00 2001 From: YQ Tsui Date: Mon, 28 Nov 2022 18:06:29 +0800 Subject: [PATCH 14/15] fix csi500 end date issue (#1373) --- scripts/data_collector/cn_index/collector.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index 9b91fc05653..40fbe4c9a49 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -394,7 +394,7 @@ def get_history_companies(self) -> pd.DataFrame: type: str, value from ["add", "remove"] """ bs.login() - today = pd.datetime.now() + today = pd.Timestamp.now() date_range = pd.DataFrame(pd.date_range(start="2007-01-15", end=today, freq="7D"))[0].dt.date ret_list = [] col = ["date", "symbol", "code_name"] @@ -410,7 +410,8 @@ def get_history_companies(self) -> pd.DataFrame: bs.logout() return pd.concat(ret_list, sort=False) - def get_data_from_baostock(self, date) -> pd.DataFrame: + @staticmethod + def get_data_from_baostock(date) -> pd.DataFrame: """ Data source: http://baostock.com/baostock/index.php/%E4%B8%AD%E8%AF%81500%E6%88%90%E5%88%86%E8%82%A1 Avoid a large number of parallel data acquisition, @@ -452,13 +453,13 @@ def get_new_companies(self) -> pd.DataFrame: end_date: pd.Timestamp """ logger.info("get new companies......") - today = datetime.date.today() + today = pd.Timestamp.now().normalize() bs.login() - result = self.get_data_from_baostock(today) + result = self.get_data_from_baostock(today.strftime("%Y-%m-%d")) bs.logout() df = result[["date", "symbol"]] df.columns = [self.END_DATE_FIELD, self.SYMBOL_FIELD_NAME] - df[self.END_DATE_FIELD] = pd.to_datetime(df[self.END_DATE_FIELD].astype(str)) + df[self.END_DATE_FIELD] = today df[self.START_DATE_FIELD] = self.bench_start_date logger.info("end of get new companies.") return df From 6a47416a2d21489578945ee085758608ed5e8bd2 Mon Sep 17 00:00:00 2001 From: YQ Tsui Date: Tue, 29 Nov 2022 08:09:22 +0800 Subject: [PATCH 15/15] Fix logging_level: make logging level specified in qlib.init applies to all loggers (#1368) * fix logging_level: make logging level specified in qlib.init apply to all loggers * downgrade loglevel in expmanager __init__ to debug (it will be called in each process in multiprocessing operations such as read data) * correct gramma error * fix black lint * use functor to cache loggers and set level * correct black lint * correct pylint * correct pylint --- .../developer/code_standard_and_dev_guide.rst | 2 +- qlib/__init__.py | 6 +-- qlib/config.py | 3 +- qlib/log.py | 51 +++++++++++-------- qlib/workflow/exp.py | 9 ++-- qlib/workflow/expm.py | 2 +- qlib/workflow/recorder.py | 3 +- 7 files changed, 42 insertions(+), 34 deletions(-) diff 
--git a/docs/developer/code_standard_and_dev_guide.rst b/docs/developer/code_standard_and_dev_guide.rst index 79a7778ad1a..87f193b8e5e 100644 --- a/docs/developer/code_standard_and_dev_guide.rst +++ b/docs/developer/code_standard_and_dev_guide.rst @@ -16,7 +16,7 @@ When you submit a PR request, you can check whether your code passes the CI test 1. Qlib will check the code format with black. The PR will raise error if your code does not align to the standard of Qlib(e.g. a common error is the mixed use of space and tab). - You can fix the bug by inputing the following code in the command line. + You can fix the bug by inputting the following code in the command line. .. code-block:: bash diff --git a/qlib/__init__.py b/qlib/__init__.py index 3a666c5bef5..61113244e9b 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -10,7 +10,7 @@ import logging import platform import subprocess -from .log import get_module_logger +from .log import get_module_logger, set_global_logger_level # init qlib @@ -34,8 +34,7 @@ def init(default_conf="client", **kwargs): from .config import C # pylint: disable=C0415 from .data.cache import H # pylint: disable=C0415 - # FIXME: this logger ignored the level in config - logger = get_module_logger("Initialization", level=logging.INFO) + logger = get_module_logger("Initialization") skip_if_reg = kwargs.pop("skip_if_reg", False) if skip_if_reg and C.registered: @@ -48,6 +47,7 @@ def init(default_conf="client", **kwargs): if clear_mem_cache: H.clear() C.set(default_conf, **kwargs) + get_module_logger.setLevel(C.logging_level) # mount nfs for _freq, provider_uri in C.provider_uri.items(): diff --git a/qlib/config.py b/qlib/config.py index 4b4123643ce..63773fcef29 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -411,8 +411,7 @@ def set(self, default_conf: str = "client", **kwargs): if _logging_config: set_log_with_config(_logging_config) - # FIXME: this logger ignored the level in config - logger = get_module_logger("Initialization", level=logging.INFO) + logger = get_module_logger("Initialization", kwargs.get("logging_level", self.logging_level)) logger.info(f"default_conf: {default_conf}.") self.set_mode(default_conf) diff --git a/qlib/log.py b/qlib/log.py index a2fbde4b5db..115abc137f3 100644 --- a/qlib/log.py +++ b/qlib/log.py @@ -48,33 +48,44 @@ def __getattr__(self, name): return self.logger.__getattribute__(name) -def get_module_logger(module_name, level: Optional[int] = None) -> QlibLogger: - """ - Get a logger for a specific module. +class _QLibLoggerManager: + def __init__(self): + self._loggers = {} - :param module_name: str - Logic module name. - :param level: int - :return: Logger - Logger object. - """ - if level is None: - level = C.logging_level + def setLevel(self, level): + for logger in self._loggers.values(): + logger.setLevel(level) + + def __call__(self, module_name, level: Optional[int] = None) -> QlibLogger: + """ + Get a logger for a specific module. + + :param module_name: str + Logic module name. + :param level: int + :return: Logger + Logger object. + """ + if level is None: + level = C.logging_level + + if not module_name.startswith("qlib."): + # Add a prefix of qlib. when the requested ``module_name`` doesn't start with ``qlib.``. + # If the module_name is already qlib.xxx, we do not format here. Otherwise, it will become qlib.qlib.xxx. + module_name = "qlib.{}".format(module_name) + + # Get logger. 
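+        # NOTE: `setdefault` caches the logger, so repeated calls for the same
+        # module share one QlibLogger instance and `setLevel` above reaches them all.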
+ module_logger = self._loggers.setdefault(module_name, QlibLogger(module_name)) + module_logger.setLevel(level) + return module_logger - if not module_name.startswith("qlib."): - # Add a prefix of qlib. when the requested ``module_name`` doesn't start with ``qlib.``. - # If the module_name is already qlib.xxx, we do not format here. Otherwise, it will become qlib.qlib.xxx. - module_name = "qlib.{}".format(module_name) - # Get logger. - module_logger = QlibLogger(module_name) - module_logger.setLevel(level) - return module_logger +get_module_logger = _QLibLoggerManager() class TimeInspector: - timer_logger = get_module_logger("timer", level=logging.INFO) + timer_logger = get_module_logger("timer") time_marks = [] diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py index 95e5db47380..d0adda66ea9 100644 --- a/qlib/workflow/exp.py +++ b/qlib/workflow/exp.py @@ -4,13 +4,12 @@ from typing import Dict, List, Union from qlib.typehint import Literal import mlflow -import logging from mlflow.entities import ViewType from mlflow.exceptions import MlflowException from .recorder import Recorder, MLflowRecorder from ..log import get_module_logger -logger = get_module_logger("workflow", logging.INFO) +logger = get_module_logger("workflow") class Experiment: @@ -22,7 +21,7 @@ class Experiment: def __init__(self, id, name): self.id = id self.name = name - self.active_recorder = None # only one recorder can running each time + self.active_recorder = None # only one recorder can run each time self._default_rec_name = "abstract_recorder" def __repr__(self): @@ -232,7 +231,7 @@ def list_recorders( Returns ------- - The return type depent on `rtype` + The return type depends on `rtype` if `rtype` == "dict": A dictionary (id -> recorder) of recorder information that being stored. elif `rtype` == "list": @@ -354,7 +353,7 @@ def list_recorders( Parameters ---------- max_results : int - the number limitation of the results + the number limitation of the results' status : str the criteria based on status to filter results. `None` indicates no filtering. diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index 3aaa574dd2f..be6b494e054 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -37,7 +37,7 @@ def __init__(self, uri: Text, default_exp_name: Optional[Text]): self._active_exp_uri = None # No active experiments. So it is set to None self._default_exp_name = default_exp_name self.active_experiment = None # only one experiment can be active each time - logger.info(f"experiment manager uri is at {self.uri}") + logger.debug(f"experiment manager uri is at {self.uri}") def __repr__(self): return "{name}(uri={uri})".format(name=self.__class__.__name__, uri=self.uri) diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 4b3fde20fcf..4d0fd8a239c 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -5,7 +5,6 @@ import sys from typing import Optional import mlflow -import logging import shutil import pickle import tempfile @@ -20,7 +19,7 @@ from ..log import TimeInspector, get_module_logger from mlflow.store.artifact.azure_blob_artifact_repo import AzureBlobArtifactRepository -logger = get_module_logger("workflow", logging.INFO) +logger = get_module_logger("workflow") # mlflow limits the length of log_param to 500, but this caused errors when using qrun, so we extended the mlflow limit. mlflow.utils.validation.MAX_PARAM_VAL_LENGTH = 1000
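
To close, a stripped-down, self-contained version of the logger-manager pattern that this last patch introduces (QlibLogger's handler plumbing is omitted here, so the standard `logging.getLogger` stands in for it):

    import logging

    class _LoggerManager:
        """Callable that caches loggers so a later setLevel reaches all of them."""

        def __init__(self):
            self._loggers = {}

        def setLevel(self, level):
            for logger in self._loggers.values():
                logger.setLevel(level)

        def __call__(self, module_name, level=logging.INFO):
            if not module_name.startswith("qlib."):
                module_name = "qlib.{}".format(module_name)
            # setdefault returns the cached logger on repeated calls
            logger = self._loggers.setdefault(module_name, logging.getLogger(module_name))
            logger.setLevel(level)
            return logger

    get_module_logger = _LoggerManager()

    a = get_module_logger("workflow")
    b = get_module_logger("workflow")
    assert a is b  # one shared instance per module name
    get_module_logger.setLevel(logging.DEBUG)  # what qlib.init does with C.logging_level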