Skip to content
63 changes: 47 additions & 16 deletions sdp/processors/modify_manifest/data_to_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,36 +405,67 @@ def finalize(self, metrics):


class SubRegex(BaseParallelProcessor):
"""Converts a regex match to a string, as defined by key-value pairs in ``regex_to_sub``.
"""
Applies a sequence of regex substitutions to the specified text field in each data entry.

This processor performs regex-based substitutions as defined in either a provided list of
regex parameter dictionaries or a YAML configuration file. Each substitution is applied in
the order specified.

Before applying regex changes, we will add a space
character to the beginning and end of the ``text`` and ``pred_text``
keys for each data entry. After the regex changes,
the extra spaces are removed. This includes the spaces in the beginning
and end of the text, as well as any double spaces ``" "``.
Before substitutions are applied, a space is temporarily added to the beginning and end of the text
to improve regex match consistency. After all substitutions, leading/trailing spaces and repeated
spaces are removed.

Args:
regex_params_list (list[dict]): list of dicts.
Each dict must contain a ``pattern`` and a ``repl`` key,
and optionally a ``count`` key (by default, ``count`` will be 0).
This processor will go through the list in order, and apply a ``re.sub`` operation on
the input text in ``data_entry[self.text_key]``, feeding in the specified ``pattern``, ``repl``
and ``count`` parameters to ``re.sub``.
text_key (str): a string indicating which key of the data entries
should be used to find the utterance transcript. Defaults to "text".
regex_params_list (List[Dict], optional): A list of dictionaries specifying the regex substitutions.
Each dictionary must include::

- "pattern": A regex pattern to match.
- "repl": A replacement string.
- "count" (optional): Maximum number of replacements to make. Defaults to 0 (replace all).

regex_params_yaml (str, optional): Path to a YAML file that defines the same list of dictionaries
as `regex_params_list`. Either `regex_params_list` or `regex_params_yaml` must be provided.
If both are provided, `regex_params_yaml` takes precedence.

text_key (str): The key in each data entry whose value will be modified. Defaults to "text".

**kwargs: Additional arguments passed to the BaseParallelProcessor.

Example YAML format for `regex_params_yaml`:
```
# regex_params.yaml
- {"pattern": "♩", "repl": " "}
- {"pattern": "♭", "repl": " "}
- {"pattern": "\\|", "repl": " "}
- {"pattern": ":", "repl": " "}
- {"pattern": "-", "repl": " "}
- {"pattern": "[^ €₽₴$£%?!',.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя]", "repl": ""}
- {"pattern": "\\s+\\.", "repl": "."}
- {"pattern": "\\?+", "repl": "?"}
- {"pattern": "\\.+", "repl": "."}
```

Returns:
The same data as in the input manifest with ``<text_key>`` field changed.
The same data as in the input manifest with ``<text_key>`` field changed.
"""

def __init__(
self,
regex_params_list: List[Dict],
regex_params_list: List[Dict] = None,
regex_params_yaml: str = None,
text_key: str = "text",
**kwargs,
):
super().__init__(**kwargs)
if not regex_params_list and not regex_params_yaml:
raise ValueError(f'One of `regex_params_list` or `regex_params_yaml` should be provided.')

self.regex_params_list = regex_params_list
if regex_params_yaml:
with open(regex_params_yaml, 'r') as regex_params_file:
self.regex_params_list = yaml.safe_load(regex_params_file)

self.text_key = text_key

# verify all dicts in regex_params_list have "pattern" and "repl" keys
Expand Down
Loading