diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index ec28562b..7920721e 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -62,6 +62,9 @@ jobs:
           then pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn;
           else pip install "scikit-learn~=${{ matrix.sklearn_version }}";
           fi
+          if [ ${{ matrix.os }} == "ubuntu-latest" ];
+          then wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb && sudo dpkg -i pandoc-2.19.2-1-amd64.deb;
+          fi
           python --version
           pip --version
           pip list
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 247ca63a..7e72ddf0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,6 +6,7 @@ repos:
         exclude: .github/conda/meta.yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
+        exclude: skops/card/tests/examples
      - id: check-case-conflict
      - id: check-merge-conflict
  - repo: https://github.com/psf/black
diff --git a/docs/changes.rst b/docs/changes.rst
index 5bd8694e..d165b401 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -20,6 +20,9 @@ v0.5
   enabled, will result in the Hugging Face inference API running with Intel's
   scikit-learn intelex library, which can accelerate inference times. :pr:`267`
   by `Benjamin Bossan`_.
+- Model cards that have been written into a markdown file can now be parsed back
+  into a :class:`skops.card.Card` object and edited further by using the
+  :func:`skops.card.parse_modelcard` function. :pr:`257` by `Benjamin Bossan`_.
 
 v0.4
 ----
diff --git a/docs/model_card.rst b/docs/model_card.rst
index 7a6c124d..d3b726d4 100644
--- a/docs/model_card.rst
+++ b/docs/model_card.rst
@@ -11,6 +11,9 @@ beginning of it, followed by the content of the model card in markdown
 format. The metadata section is used to make models searchable on the Hub, and
 get the inference API and the widgets on the website working.
 
+Metadata
+--------
+
 The metadata part of the file needs to follow the specifications
 `here <https://huggingface.co/docs/hub/model-cards>`__.
 It includes simple attributes of your models such as the task you're solving,
@@ -40,6 +43,9 @@ Here's an example of the metadata section of the ``README.md`` file:
 
 ``skops`` creates this section of the file for you, and you almost never need
 to touch it yourself.
 
+Model Card Content
+------------------
+
 The markdown part does not necessarily need to follow any specification in
 terms of information passed, which gives the user a lot of flexibility. The
 markdown part of the ``README.md`` file comes with a couple of defaults provided
@@ -90,8 +96,8 @@ as well as adding some subsections with plots below that, you can call the
     })
 
 Furthermore, you can select existing sections (as well as their subsections)
-using :meth:`Card.select`, and you can delete sections using
-:meth:`Card.delete`:
+using :meth:`.Card.select`, and you can delete sections using
+:meth:`.Card.delete`:
 
 .. code-block:: python
 
@@ -103,3 +109,43 @@ using :meth:`Card.select`, and you can delete sections
 
 To see how you can use the API in ``skops`` to create a model card, please
 refer to :ref:`sphx_glr_auto_examples_plot_model_card.py`.
+
+Saving and Loading Model Cards
+------------------------------
+
+Once you have finished creating and modifying the model card, you can save it
+using the :meth:`.Card.save` method:
+
+.. code-block:: python
+
+    card.save("README.md")
+
+This renders the content of the model card to markdown format and stores it in
+the indicated file. It is now ready to be uploaded to the Hugging Face Hub.
+
+If you have a finished model card but want to load it to make some
+modifications, you can use the function :func:`skops.card.parse_modelcard`.
+This function parses the model card back into a :class:`.Card` instance that
+you can work on further:
+
+.. code-block:: python
+
+    from skops import card
+    model_card = card.parse_modelcard("README.md")
+    model_card.add(**{"A new section": "Some new content"})
+    model_card.save("README.md")
+
+When the card is parsed, some minor details of the model card can change: e.g.
+table column alignment may be reset to the default, excess empty lines may be
+collapsed, and trailing whitespace removed. However, the content itself should
+be exactly the same. All known deviations are documented in the
+:func:`skops.card.parse_modelcard` docstring.
+
+For the parsing part, we rely on `pandoc <https://pandoc.org/>`_. If you haven't
+installed it, please follow `these instructions
+<https://pandoc.org/installing.html>`_. The advantage of using pandoc is that
+it's a very mature tool and that it supports many different document formats.
+Therefore, it should be possible to parse model cards even if they use a format
+that's not markdown, for instance reStructuredText, org, or asciidoc. For
+saving, we only support markdown for now.
diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py
index 565263fc..24edeb01 100644
--- a/skops/_min_dependencies.py
+++ b/skops/_min_dependencies.py
@@ -25,6 +25,7 @@
     "sphinx-prompt": ("1.3.0", "docs", None),
     "sphinx-issues": ("1.2.0", "docs", None),
     "matplotlib": ("3.3", "docs, tests", None),
+    "packaging": ("17.0", "install", None),
     "pandas": ("1", "docs, tests", None),
     # required for persistence tests of external libraries
     "lightgbm": ("3", "tests", None),
diff --git a/skops/card/__init__.py b/skops/card/__init__.py
index 80ae1963..0febe42a 100644
--- a/skops/card/__init__.py
+++ b/skops/card/__init__.py
@@ -1,3 +1,4 @@
 from ._model_card import Card, metadata_from_config
+from ._parser import parse_modelcard
 
-__all__ = ["Card", "metadata_from_config"]
+__all__ = ["Card", "metadata_from_config", "parse_modelcard"]
diff --git a/skops/card/_markup.py b/skops/card/_markup.py
new file mode 100644
index 00000000..7779a675
--- /dev/null
+++ b/skops/card/_markup.py
@@ -0,0 +1,338 @@
+"""Classes for translating into the syntax of different markup languages"""
+
+from __future__ import annotations
+
+import sys
+from collections.abc import Mapping
+from contextlib import contextmanager
+from typing import Any, Sequence
+
+from skops.card._model_card import TableSection
+
+if sys.version_info >= (3, 9):
+    from typing import TypedDict
+else:
+    from typing_extensions import TypedDict
+
+
+class PandocItem(TypedDict):
+    t: str
+    c: dict
+
+
+class Markdown:
+    """Mapping of pandoc parsed document to Markdown
+
+    This class has a ``mapping`` attribute, which is just a dict. The keys are
+    Pandoc types and the values are functions that transform the corresponding
+    value into a string with markdown syntax. Those functions are all prefixed
+    with ``_``, e.g. ``_image`` for transforming a pandoc ``Image`` into a
+    markdown figure, or ``_raw_block`` for transforming a pandoc ``RawBlock``.
+
+    From the caller side, only the ``__call__`` method should be used; the rest
+    should be considered internals.
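+
+    For instance, a pandoc ``Strong`` inline wrapping a ``Str`` is rendered as
+    markdown bold (a minimal illustration of the dispatch table at work):
+
+    >>> md = Markdown()
+    >>> md({"t": "Strong", "c": [{"t": "Str", "c": "bold"}]})
+    '**bold**'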
+
+    """
+
+    def __init__(self):
+        # markdown syntax dispatch table
+        self.mapping = {
+            "Space": self._space,
+            "Plain": self._plain,
+            "Str": self._str,
+            "Strong": self._strong,
+            "Emph": self._emph,
+            "Strikeout": self._strikeout,
+            "RawInline": self._raw_inline,
+            "RawBlock": self._raw_block,
+            "SoftBreak": self._soft_break,
+            "Para": self._para,
+            "Header": self._header,
+            "Image": self._image,
+            "Figure": self._figure,
+            "CodeBlock": self._code_block,
+            "Code": self._code,
+            "Table": self._table,
+            "Div": self._parse_div,
+            "Link": self._link,
+            "BulletList": self._bullet_list,
+            "OrderedList": self._ordered_list,
+            "Quoted": self._quoted,
+            "BlockQuote": self._block_quote,
+        }
+        # _indent_trace records the indentation width of each nested block
+        # (e.g. nested lists). Levels are pushed and popped through the
+        # _indented context manager and summed up by _get_indent.
+        self._indent_trace = []
+
+    @contextmanager
+    def _indented(self, *, spaces: int):
+        """Temporarily add one indentation level of ``spaces`` spaces"""
+        self._indent_trace.append(spaces)
+        yield
+        self._indent_trace.pop(-1)
+
+    def _get_indent(self, *, incr: int = 0) -> str:
+        """Get current indentation, optionally incremented"""
+        # The innermost level is skipped: it belongs to the item currently
+        # being rendered (e.g. its list marker) and only applies to its
+        # children; callers pass incr to include it where needed.
+        return " " * (incr + sum(self._indent_trace[:-1]))
+
+    @staticmethod
+    def _space(value) -> str:
+        return " "
+
+    def _plain(self, value) -> str:
+        parts = [self.__call__(subitem) for subitem in value]
+        return "".join(parts)
+
+    @staticmethod
+    def _str(value) -> str:
+        # escape backslashes
+        return value.replace("\\", "\\\\")
+
+    def _strong(self, value) -> str:
+        parts = ["**"]
+        parts += [self.__call__(subitem) for subitem in value]
+        parts.append("**")
+        return "".join(parts)
+
+    def _emph(self, value) -> str:
+        parts = ["_"]
+        parts += [self.__call__(subitem) for subitem in value]
+        parts.append("_")
+        return "".join(parts)
+
+    def _strikeout(self, value) -> str:
+        parts = ["~~"]
+        parts += [self.__call__(subitem) for subitem in value]
+        parts.append("~~")
+        return "".join(parts)
+
+    @staticmethod
+    def _raw_inline(value) -> str:
+        _, line = value
+        return line
+
+    def _raw_block(self, item) -> str:
+        # throw away the first entry, which is just the format, e.g. 'html';
+        # might have to revisit this if output != markdown
+        _, line = item
+        return line
+
+    def _soft_break(self, value) -> str:
+        incr = 0 if not self._indent_trace else self._indent_trace[-1]
+        return "\n" + self._get_indent(incr=incr)
+
+    def _make_content(self, content):
+        parts = []
+        for item in content:
+            part = "".join(self.__call__(item))
+            parts.append(part)
+        return "".join(parts)
+
+    def _para(self, value: list[dict[str, str]]) -> str:
+        content = self._make_content(value)
+        return content
+
+    def _header(self, value: tuple[int, Any, list[dict[str, str]]]) -> str:
+        level, _, content_parts = value
+        section_name = self._make_content(content_parts)
+        return section_name
+
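+    # The "c" payload of a pandoc Image node, as unpacked by _image below, has
+    # roughly this shape (abridged):
+    #   [["", [], []], [{"t": "Str", "c": "caption"}], ["image.png", "fig:"]]
+    # i.e. (ident, classes, keyvals), the caption inlines, and (target, type).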
+    def _image(self, value) -> str:
+        (ident, _, keyvals), caption, (dest, typef) = value
+        # it seems like ident and keyvals are not relevant for markdown
+
+        if not caption:  # pragma: no cover
+            # not sure if this can be reached, just to be safe
+            raise ValueError("Figure missing a caption")
+
+        if not typef.startswith("fig:"):  # pragma: no cover
+            # not sure if this can be reached, just to be safe
+            raise ValueError(f"Cannot deal with figure of type '{typef}'")
+
+        caption = "".join(self.__call__(i) for i in caption)
+        content = f"![{caption}]({dest})"
+        return content
+
+    def _figure(self, value) -> str:  # pragma: no cover
+        # Figure type was added in Pandoc v3.0
+        (ident, classes, keyvals), caption, (body,) = value
+
+        body_type = body["t"]
+        # we can only deal with plain figures for now
+        if body_type != "Plain":
+            raise ValueError(f"Cannot deal with figure of type '{body_type}'")
+
+        plain_fig = body["c"][0]["c"]
+        plain_fig[2][1] = "fig:"
+        return self._image(plain_fig)
+
+    @staticmethod
+    def _code_block(item: tuple[tuple[str, list[str], list[str]], str]) -> str:
+        # a codeblock consists of: (id, classes, namevals) contents
+        (_, classes, _), content = item
+        block_start = "```"
+        if classes:
+            block_start += ", ".join(classes)
+        block_end = "```"
+        content = "\n".join((block_start, content, block_end))
+        return content
+
+    @staticmethod
+    def _code(item: tuple[Any, str]) -> str:
+        _, txt = item
+        return f"`{txt}`"
+
+    def _table_cols(self, items) -> list[str]:
+        columns = []
+        fn = self.__call__
+        for item in items:
+            _, alignment, _, _, content = item
+            column = "".join(fn(part) for part in content)
+            columns.append(column)
+        return columns
+
+    def _table_body(self, items) -> list[list[str]]:
+        body = []
+        fn = self.__call__
+        for _, row_items in items:
+            row = []
+            for col_row_item in row_items:
+                _, alignment, _, _, content = col_row_item
+                row.append("".join(fn(part) for part in content))
+            body.append(row)
+        return body
+
+    def _table(self, item) -> str:
+        # attr capt specs thead tbody tfoot
+        _, _, _, thead, tbody, _ = item
+
+        # header
+        (_, thead_bodies) = thead
+        (attr, thead_body) = thead_bodies[0]  # multiple headers?
+
+        columns = self._table_cols(thead_body)
+
+        # rows
+        # attr rhc hd bd
+        _, _, _, trows = tbody[0]  # multiple groups of rows?
+        body = self._table_body(trows)
+
+        table: Mapping[str, Sequence[Any]]
+        if not body:
+            table = {key: [] for key in columns}
+        else:
+            # body is row oriented, transpose to column oriented
+            data_transposed = zip(*body)
+            table = {key: val for key, val in zip(columns, data_transposed)}
+
+        res = TableSection(table).format()
+        return res
+
+    def _parse_div(self, item) -> str:
+        # note that in markdown, we basically just use the raw html
+        (ident, classes, kvs), contents = item
+
+        # build the opening div tag, including its attributes
+        tags = ["<div"]
+        if ident:
+            tags.append(f' id="{ident}"')
+        if classes:
+            tags.append(f' class="{" ".join(classes)}"')
+        for key, val in kvs:
+            tags.append(f' {key}="{val}"')
+        tags.append(">")
+
+        start = "".join(tags)
+        middle = []
+        for content in contents:
+            with self._indented(spaces=2):
+                middle.append(self.__call__(content))
+        end = "</div>"
+        return "".join([start] + middle + [end])
+
+    def _link(self, item) -> str:
+        _, txt, (src, _) = item
+        txt_formatted = self._make_content(txt)
+        return f"[{txt_formatted}]({src})"
+
+    def _make_list_item(self, items: list, list_marker: str) -> str:
+        # helper function used for bullet and ordered lists
+        parts = [self.__call__(subitem) for subitem in items]
+        content = "\n".join(parts)
+        return f"{self._get_indent()}{list_marker} {content}"
+
+    def _bullet_list(self, item) -> str:
+        # we don't differentiate between lists starting with "-", "*", or "+".
+        list_marker = "-"
+        parts = []
+        # bullet lists use 2 spaces for indentation to align "- "
+        with self._indented(spaces=2):
+            for subitem in item:
+                parts.append(self._make_list_item(subitem, list_marker=list_marker))
+        return "\n".join(parts)
+
+    def _ordered_list(self, item) -> str:
+        # we don't make use of num_type and sep_type, which just indicate that
+        # numbers are presented as decimal numbers using a period
+        (start, num_type, sep_type), items = item
+        parts = []
+        # ordered lists use 3 spaces for indentation to align "1. "
+        with self._indented(spaces=3):
+            for i, subitem in enumerate(items, start=start):
+                parts.append(self._make_list_item(subitem, list_marker=f"{i}."))
+        return "\n".join(parts)
+
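+    # For reference, nesting combines these widths, e.g. an ordered list
+    # nested inside another one renders as:
+    #   1. parent
+    #      1. child
+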
" + with self._indented(spaces=3): + for i, subitem in enumerate(items, start=start): + parts.append(self._make_list_item(subitem, list_marker=f"{i}.")) + return "\n".join(parts) + + def _quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: + quote_type, content = item + type_ = quote_type["t"] + try: + sym = {"DoubleQuote": '"', "SingleQuote": "'"}[type_] + except KeyError as exc: # pragma: no cover + # can probably not be reached, but let's be sure + msg = ( + f"The parsed document contains '{type_}', which is not " + "supported yet, please open an issue on GitHub" + ) + raise ValueError(msg) from exc + + text = "".join(self.__call__(i) for i in content) + return f"{sym}{text}{sym}" + + def _block_quote(self, item: list[PandocItem]) -> str: + parts = [] + for subitem in item: + content = self.__call__(subitem) + # add quote symbolx + content = content.replace("\n", "\n> ") + parts.append(content) + + # add a quote symbol to the very start + text = "> " + "\n> ".join(parts) + return text + + def __call__(self, item: str | PandocItem) -> str: + if isinstance(item, str): + return item + + type_, value = item["t"], item.get("c") + try: + res = self.mapping[type_](value) + except KeyError as exc: + msg = ( + f"The parsed document contains '{type_}', which is not " + "supported yet, please open an issue on GitHub" + ) + raise ValueError(msg) from exc + + # recursively call until the value has been resolved into a str + return self.__call__(res) diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 0e180336..ae3d7af3 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -687,6 +687,11 @@ def _add_single(self, key: str, val: Formattable | str) -> Section: val: str or Formattable The value to assign to the (sub)section. + Returns + ------- + Section instance + The section that has been added or modified. + """ *subsection_names, leaf_node_name = split_subsection_names(key) section = self._select(subsection_names) diff --git a/skops/card/_parser.py b/skops/card/_parser.py new file mode 100644 index 00000000..86d4f120 --- /dev/null +++ b/skops/card/_parser.py @@ -0,0 +1,344 @@ +"""Contains the PandocParser + +This class needs to know about the pandoc parse tree but should not have +knowledge of any particular markup syntex; everything related to markup should +be known by the mapping attribute. + +""" + +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from tempfile import mkdtemp +from typing import Any, Literal + +import yaml # type: ignore +from packaging.version import Version + +from skops.card import Card +from skops.card._model_card import Section + +from ._markup import Markdown, PandocItem + +PANDOC_MIN_VERSION = "2.9.0" + + +class PandocParser: + """Create model cards from files parsed through pandoc. + + This class knows about the implementation details of the + :class:`~skops.card.Card` and generates it by initializing an empty class + and then calling its methods with the input provided by pandoc. + + ``PandocParser`` does not know about any specific markup type, such as + markdown. Instead, it is initialized with a ``Mapping``, which is + responsible to convert pandoc input into the desired markup language. + + After initializing this class, call + :meth:`~skops.card._parser.PandocParser.generate` to generate the resulting + :class:`~skops.card.Card` instance. + + Parameters + ---------- + source : str + The model card parsed using the ``pandoc -t json`` option. 
+
+    """
+
+    def __init__(
+        self, source: str, markup_type: Literal["markdown"] = "markdown"
+    ) -> None:
+        self.source = source
+        if markup_type.lower() == "markdown":
+            self.mapping = Markdown()
+        else:
+            raise ValueError(f"Markup of type {markup_type} is not supported (yet)")
+
+    def _add_section(self, card: Card, section_trace: list[str]) -> Section:
+        # Add a new section to the card, which can be a subsection, and return
+        # it. The full path of the section is derived from section_trace, whose
+        # last entry is the name of the section being added.
+        section_name = "/".join(section_trace)
+        cur_section = card._add_single(section_name, "")
+        return cur_section
+
+    def _add_content(self, content: str, section: Section | None) -> None:
+        # Add content to the current section
+        if section is None:
+            # This may occur if the model card starts without a section. This is
+            # not illegal in markdown, but we don't handle it yet.
+            raise ValueError(
+                "Trying to add content but there is no current section, "
+                "this is probably a bug, please open an issue on GitHub."
+            )
+
+        if not section.content:
+            section.content = content
+        elif isinstance(section.content, str):
+            section.content = section.content + "\n\n" + content
+        else:  # pragma: no cover
+            # TODO: Content is a Formattable, no generic way to modify it --
+            # should we require each Formattable to have an update method?
+            raise ValueError(f"Could not modify content of {section.content}")
+
+    def _parse_header(self, item: PandocItem) -> tuple[str, int]:
+        # Headers are the only type of item that needs to be handled
+        # differently. This is because we structure the underlying model card
+        # data as a tree with nodes corresponding to headers. To assign the
+        # right parent or child node, we need to keep track of the level of the
+        # headers. This cannot be done on the level of the markdown mapping,
+        # since it is not aware of the tree structure.
+        level, _, _ = item["c"]
+        content = self.mapping(item)
+        return content, level
+
+    def _post_process(self, res: str) -> str:
+        # replace non-breaking (Latin-1) space
+        res = res.replace("\xa0", " ")
+
+        # pandoc creates ☒ and ☐ for to do items but GitHub requires [x] and [ ]
+        # for an item to be considered a to do item
+        res = res.replace("- ☒", "- [x]").replace("- ☐", "- [ ]")
+        return res
+
+    def generate(self) -> Card:
+        """Generate the model card instance from the parsed card.
+
+        Returns
+        -------
+        card : :class:`~skops.card.Card`
+            The parsed model card instance. If not further modified, the output
+            of saving that card should be (almost) identical to the initial
+            model card.
+        """
+        section: Section | None = None
+        section_trace: list[str] = []
+        card = Card(None, template=None)
+
+        # Parse the flat structure, not recursively as in pandocfilters. After
+        # visiting a parent node, it's not necessary to visit its child nodes,
+        # because the mapping class already takes care of that.
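+        # An abridged example of the pandoc JSON consumed here:
+        #   {"blocks": [{"t": "Header", "c": [2, ["usage", [], []],
+        #                                     [{"t": "Str", "c": "Usage"}]]},
+        #               {"t": "Para", "c": [{"t": "Str", "c": "..."}]}]}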
+        for item in json.loads(self.source)["blocks"]:
+            if item["t"] == "Header":
+                content, level = self._parse_header(item)
+                res = self._post_process(content)
+                section_trace = section_trace[: level - 1] + [res]
+                section = self._add_section(card=card, section_trace=section_trace)
+            else:
+                res = self._post_process(self.mapping(item))
+                self._add_content(res, section=section)
+
+        return card
+
+
+def _get_pandoc_version() -> str:
+    """Shell out to retrieve the pandoc version
+
+    Raises
+    ------
+    RuntimeError
+        If the pandoc version could not be determined, raise a ``RuntimeError``.
+
+    Returns
+    -------
+    pandoc_version : str
+        The pandoc version as a string, e.g. ``"2.19.2"``.
+    """
+    proc = subprocess.run(
+        ["pandoc", "--version"],
+        capture_output=True,
+    )
+    version_info = proc.stdout.decode("utf-8").split("\n", 1)[0]
+    if not version_info.startswith("pandoc "):
+        # pandoc is installed but version cannot be determined
+        raise RuntimeError("Could not determine version of pandoc.")
+
+    _, _, pandoc_version = version_info.partition(" ")
+    return pandoc_version
+
+
+def check_pandoc_installed(
+    min_version: str | None = PANDOC_MIN_VERSION,
+) -> None:
+    """Check if pandoc is installed on the system
+
+    Parameters
+    ----------
+    min_version : str or None
+        If passed, check that the pandoc version is greater than or equal to
+        this one.
+
+    Raises
+    ------
+    FileNotFoundError
+        When the binary is not found, raise this error.
+
+    RuntimeError
+        If the pandoc version could not be determined, raise a ``RuntimeError``.
+
+    ValueError
+        If a min version is passed and not matched or exceeded, raise a
+        ``ValueError``.
+    """
+    try:
+        pandoc_version = _get_pandoc_version()
+    except FileNotFoundError as exc:
+        # pandoc is not installed
+        msg = (
+            "This feature requires the pandoc library to be installed on your system, "
+            "please follow these install instructions: "
+            "https://pandoc.org/installing.html."
+        )
+        raise FileNotFoundError(msg) from exc
+
+    if not min_version:
+        return
+
+    if Version(pandoc_version) < Version(min_version):
+        raise ValueError(
+            f"Pandoc version too low, expected at least {min_version}, "
+            f"got {pandoc_version} instead."
+        )
+
+
+def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]:
+    """Detach the possibly existing yaml part of the model card
+
+    Model cards always have a markdown part and optionally a yaml part at the
+    head, delimited by "---". Obviously, pandoc cannot parse that. Therefore, we
+    detach the yaml part and return it as a separate dict, only leaving
+    (hopefully) valid markdown.
+
+    Parameters
+    ----------
+    path : str or pathlib.Path
+        The path to the model card file.
+
+    Returns
+    -------
+    file : str or pathlib.Path
+        The path to the model card without any yaml metainfo. If the model card
+        didn't contain that metainfo to begin with, this is just the path to the
+        original model card. If it did contain metainfo, this is a path to a new
+        temporary file with the metainfo removed.
+
+    metainfo : dict
+        The metainfo from the yaml part as a parsed dict. If no metainfo was
+        present, the dict is empty.
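+
+        For example, for a card file beginning with::
+
+            ---
+            language: en
+            ---
+
+        the returned metainfo is ``{"language": "en"}``.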
+ """ + with open(path, "r") as f: + text = f.read() + + sep_start, sep_end = "---\n", "\n---" + + metainfo: dict[str, Any] = {} + if not text.startswith(sep_start): # no metainfo: + return path, metainfo + + idx_separator = text.find(sep_end) + if idx_separator < len(sep_start): # pragma: no cover + # separator shouldn't come earlier than this + return path, metainfo + + # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated + text_clean = text[idx_separator + len(sep_end) :] # noqa: E203 + metainfo = yaml.safe_load( # type: ignore + text[len(sep_start) : idx_separator] # noqa: E203 + ) + + file = Path(mkdtemp()) / "tmp-model-card.md" + with open(file, "w") as f: + f.write(text_clean) + return file, metainfo + + +def parse_modelcard(path: str | Path) -> Card: + """Read a model card and return a Card object + + This allows users to load a dumped model card and continue to edit it. + + Using this function requires ``pandoc`` to be installed. Please follow these + instructions: + + https://pandoc.org/installing.html + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> from skops.card import Card + >>> from skops.card import parse_modelcard + >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + >>> y = np.dot(X, np.array([1, 2])) + 3 + >>> regr = LinearRegression().fit(X, y) + >>> card = Card(regr) + >>> card.save("README.md") # doctest: +SKIP + >>> # later, load the card again + >>> parsed_card = parse_modelcard("README.md") # doctest: +SKIP + >>> # continue editing the card + >>> parsed_card.add(**{"My new section": "My new content"}) # doctest: +SKIP + Card(...) + >>> # overwrite old card with new one + >>> parsed_card.save("README.md") # doctest: +SKIP + + Notes + ----- + There are some **known limitations** to the parser that may result in the + model card generated from the parsed file not being 100% identical to the + original model card: + + - In markdown, bold and italic text can be encoded in different fashions, + e.g. ``_like this_`` or ``*like this*`` for italic text. Pandoc doesn't + differentiate between the two. Therefore, the output may use one method + where the original card used the other. When rendered, the two results + should, however, be the same. + - Table alignment may be different. At the moment, skops does not make use + of column alignment information in tables, so that may differ. + - Quote symbols may differ, e.g. ``it’s`` becoming ``it's``. + - The number of empty lines may differ, e.g. two empty lines being + transformed into one empty line. + - The optional title of links is not preserved, as e.g. in + `[text](https://example.com "this disappears")` + - Trailing whitespace is removed. + - Tab indentation may be removed, e.g. in raw html. + - The yaml part of the model card can have some non-semantic differences, + like omitting optional quotation marks. + + For these reasons, please don't expect the output of a parsed card to be + 100% identical to the original input. However, none of the listed changes + makes any _semantic_ difference. If you find that there is a semantic + difference in the output, please open an issue on GitHub. + + Parameters + ---------- + path : str or pathlib.Path + The path to the existing model card. + + Returns + ------- + card : skops.card.Card + The model card object. 
+ + """ + check_pandoc_installed() + + path, metainfo = _card_with_detached_metainfo(path) + + proc = subprocess.run( + ["pandoc", "-t", "json", "-s", str(path)], + capture_output=True, + ) + source = str(proc.stdout.decode("utf-8")) + + parser = PandocParser(source) + card = parser.generate() + for key, val in metainfo.items(): + setattr(card.metadata, key, val) + + return card diff --git a/skops/card/tests/examples/bert-base-uncased.md b/skops/card/tests/examples/bert-base-uncased.md new file mode 100644 index 00000000..e762fc35 --- /dev/null +++ b/skops/card/tests/examples/bert-base-uncased.md @@ -0,0 +1,249 @@ +--- +language: en +tags: +- exbert +license: apache-2.0 +datasets: +- bookcorpus +- wikipedia +--- + +# BERT base model (uncased) + + + +Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in +[this paper](https://arxiv.org/abs/1810.04805) and first released in +[this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference +between english and English. + +Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by +the Hugging Face team. + +## Model description + +BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it +was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of +publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it +was pretrained with two objectives: + +- Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run + the entire masked sentence through the model and has to predict the masked words. This is different from traditional + recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like + GPT which internally masks the future tokens. It allows the model to learn a bidirectional representation of the + sentence. +- Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes + they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to + predict if the two sentences were following each other or not. + +This way, the model learns an inner representation of the English language that can then be used to extract features +useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard +classifier using the features produced by the BERT model as inputs. + +## Model variations + +BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strips out an accent markers. +Chinese and multilingual uncased and cased versions followed shortly after. +Modified preprocessing with whole word masking has replaced subpiece masking in a following work, with the release of two models. +Other 24 smaller models are released afterward. + +The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github. 
+ +| Model | #params | Language | +|------------------------|--------------------------------|-------| +| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | +| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub +| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | +| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | +| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | +| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | +| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | +| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | + +## Intended uses & limitations + +You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to +be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for +fine-tuned versions of a task that interests you. + +Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) +to make decisions, such as sequence classification, token classification or question answering. For tasks such as text +generation you should look at model like GPT2. + +### How to use + +You can use this model directly with a pipeline for masked language modeling: + +```python +>>> from transformers import pipeline +>>> unmasker = pipeline('fill-mask', model='bert-base-uncased') +>>> unmasker("Hello I'm a [MASK] model.") +[{'sequence': "[CLS] hello i'm a fashion model. [SEP]", + 'score': 0.1073106899857521, + 'token': 4827, + 'token_str': 'fashion'}, + {'sequence': "[CLS] hello i'm a role model. [SEP]", + 'score': 0.08774490654468536, + 'token': 2535, + 'token_str': 'role'}, + {'sequence': "[CLS] hello i'm a new model. [SEP]", + 'score': 0.05338378623127937, + 'token': 2047, + 'token_str': 'new'}, + {'sequence': "[CLS] hello i'm a super model. [SEP]", + 'score': 0.04667217284440994, + 'token': 3565, + 'token_str': 'super'}, + {'sequence': "[CLS] hello i'm a fine model. [SEP]", + 'score': 0.027095865458250046, + 'token': 2986, + 'token_str': 'fine'}] +``` + +Here is how to use this model to get the features of a given text in PyTorch: + +```python +from transformers import BertTokenizer, BertModel +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +model = BertModel.from_pretrained("bert-base-uncased") +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='pt') +output = model(**encoded_input) +``` + +and in TensorFlow: + +```python +from transformers import BertTokenizer, TFBertModel +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +model = TFBertModel.from_pretrained("bert-base-uncased") +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='tf') +output = model(encoded_input) +``` + +### Limitations and bias + +Even if the training data used for this model could be characterized as fairly neutral, this model can have biased +predictions: + +```python +>>> from transformers import pipeline +>>> unmasker = pipeline('fill-mask', model='bert-base-uncased') +>>> unmasker("The man worked as a [MASK].") +[{'sequence': '[CLS] the man worked as a carpenter. 
[SEP]', + 'score': 0.09747550636529922, + 'token': 10533, + 'token_str': 'carpenter'}, + {'sequence': '[CLS] the man worked as a waiter. [SEP]', + 'score': 0.0523831807076931, + 'token': 15610, + 'token_str': 'waiter'}, + {'sequence': '[CLS] the man worked as a barber. [SEP]', + 'score': 0.04962705448269844, + 'token': 13362, + 'token_str': 'barber'}, + {'sequence': '[CLS] the man worked as a mechanic. [SEP]', + 'score': 0.03788609802722931, + 'token': 15893, + 'token_str': 'mechanic'}, + {'sequence': '[CLS] the man worked as a salesman. [SEP]', + 'score': 0.037680890411138535, + 'token': 18968, + 'token_str': 'salesman'}] +>>> unmasker("The woman worked as a [MASK].") +[{'sequence': '[CLS] the woman worked as a nurse. [SEP]', + 'score': 0.21981462836265564, + 'token': 6821, + 'token_str': 'nurse'}, + {'sequence': '[CLS] the woman worked as a waitress. [SEP]', + 'score': 0.1597415804862976, + 'token': 13877, + 'token_str': 'waitress'}, + {'sequence': '[CLS] the woman worked as a maid. [SEP]', + 'score': 0.1154729500412941, + 'token': 10850, + 'token_str': 'maid'}, + {'sequence': '[CLS] the woman worked as a prostitute. [SEP]', + 'score': 0.037968918681144714, + 'token': 19215, + 'token_str': 'prostitute'}, + {'sequence': '[CLS] the woman worked as a cook. [SEP]', + 'score': 0.03042375110089779, + 'token': 5660, + 'token_str': 'cook'}] +``` + +This bias will also affect all fine-tuned versions of this model. + +## Training data + +The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 +unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and +headers). + +## Training procedure + +### Preprocessing + +The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are +then of the form: + +``` +[CLS] Sentence A [SEP] Sentence B [SEP] +``` + +With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in +the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a +consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two +"sentences" has a combined length of less than 512 tokens. + +The details of the masking procedure for each sentence are the following: +- 15% of the tokens are masked. +- In 80% of the cases, the masked tokens are replaced by `[MASK]`. +- In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. +- In the 10% remaining cases, the masked tokens are left as is. + +### Pretraining + +The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size +of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer +used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, +learning rate warmup for 10,000 steps and linear decay of the learning rate after. 
+ +## Evaluation results + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| +| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | + + +### BibTeX entry and citation info + +```bibtex +@article{DBLP:journals/corr/abs-1810-04805, + author = {Jacob Devlin and + Ming{-}Wei Chang and + Kenton Lee and + Kristina Toutanova}, + title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language + Understanding}, + journal = {CoRR}, + volume = {abs/1810.04805}, + year = {2018}, + url = {http://arxiv.org/abs/1810.04805}, + archivePrefix = {arXiv}, + eprint = {1810.04805}, + timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + + + + diff --git a/skops/card/tests/examples/bert-base-uncased.md.diff b/skops/card/tests/examples/bert-base-uncased.md.diff new file mode 100644 index 00000000..2367a8d8 --- /dev/null +++ b/skops/card/tests/examples/bert-base-uncased.md.diff @@ -0,0 +1,40 @@ +--- ++++ +@@ -44,10 +44,10 @@ +-| Model | #params | Language | +-|------------------------|--------------------------------|-------| +-| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | +-| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub +-| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | +-| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | +-| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | +-| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | +-| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | +-| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | ++| Model | #params | Language | ++|---------------------------------------------------------------------------------------------------------|-----------|------------| ++| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | ++| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | ++| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | ++| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | ++| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | ++| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | ++| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | ++| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | +@@ -57 +57 @@ +-You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to ++You can use the raw model for either masked language modeling or next sentence prediction, but it’s mostly intended to +@@ -189 +189 @@ +-the other cases, it's another random sentence in the corpus. 
Note that what is considered a sentence here is a ++the other cases, it’s another random sentence in the corpus. Note that what is considered a sentence here is a +@@ -212,4 +212,3 @@ +-| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +-|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| +-| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | +- ++| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | ++|--------|---------------|-------|--------|---------|--------|---------|--------|-------|-----------| ++| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | +@@ -240 +239 @@ +- ++ diff --git a/skops/card/tests/examples/clip-vit-large-patch14.md b/skops/card/tests/examples/clip-vit-large-patch14.md new file mode 100644 index 00000000..cbbfa909 --- /dev/null +++ b/skops/card/tests/examples/clip-vit-large-patch14.md @@ -0,0 +1,147 @@ +--- +tags: +- vision +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png + candidate_labels: playing music, playing sports + example_title: Cat & Dog +--- + +# Model Card: CLIP + + + +Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). + +## Model Details + +The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. + +### Model Date + +January 2021 + +### Model Type + +The base model uses a ViT-L/14 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. + +The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. + + +### Documents + +- [Blog Post](https://openai.com/blog/clip/) +- [CLIP Paper](https://arxiv.org/abs/2103.00020) + + +### Use with Transformers + +```python +from PIL import Image +import requests + +from transformers import CLIPProcessor, CLIPModel + +model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") +processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +outputs = model(**inputs) +logits_per_image = outputs.logits_per_image # this is the image-text similarity score +probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities +``` + + +## Model Use + +### Intended Use + +The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. 
We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. + +#### Primary intended uses + +The primary intended users of these models are AI researchers. + +We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. + +### Out-of-Scope Use Cases + +**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. + +Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. + +Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. + + + +## Data + +The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. + +### Data Mission Statement + +Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. + + + +## Performance and Limitations + +### Performance + +We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. 
The paper describes model performance on the following datasets: + +- Food101 +- CIFAR10 +- CIFAR100 +- Birdsnap +- SUN397 +- Stanford Cars +- FGVC Aircraft +- VOC2007 +- DTD +- Oxford-IIIT Pet dataset +- Caltech101 +- Flowers102 +- MNIST +- SVHN +- IIIT5K +- Hateful Memes +- SST-2 +- UCF101 +- Kinetics700 +- Country211 +- CLEVR Counting +- KITTI Distance +- STL-10 +- RareAct +- Flickr30 +- MSCOCO +- ImageNet +- ImageNet-A +- ImageNet-R +- ImageNet Sketch +- ObjectNet (ImageNet Overlap) +- Youtube-BB +- ImageNet-Vid + +## Limitations + +CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. + +### Bias and Fairness + +We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). + +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. + + + +## Feedback + +### Where to send questions or comments about the model + +Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) diff --git a/skops/card/tests/examples/clip-vit-large-patch14.md.diff b/skops/card/tests/examples/clip-vit-large-patch14.md.diff new file mode 100644 index 00000000..8516f684 --- /dev/null +++ b/skops/card/tests/examples/clip-vit-large-patch14.md.diff @@ -0,0 +1,19 @@ +--- ++++ +@@ -23 +22,0 @@ +- +@@ -28 +26,0 @@ +- +@@ -51 +48,0 @@ +- +@@ -72,2 +68,0 @@ +- +- +@@ -81,2 +75,0 @@ +- +- +@@ -132,3 +125 @@ +-We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). 
Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. +- +- ++We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with 'Middle Eastern' having the highest accuracy (98.4%) and 'White' having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. diff --git a/skops/card/tests/examples/gpt2.md b/skops/card/tests/examples/gpt2.md new file mode 100644 index 00000000..6481d600 --- /dev/null +++ b/skops/card/tests/examples/gpt2.md @@ -0,0 +1,168 @@ +--- +language: en +tags: +- exbert + +license: mit +--- + +# GPT-2 + + + +Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large + +Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in +[this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) +and first released at [this page](https://openai.com/blog/better-language-models/). + +Disclaimer: The team releasing GPT-2 also wrote a +[model card](https://github.com/openai/gpt-2/blob/master/model_card.md) for their model. Content from this model card +has been written by the Hugging Face team to complete the information they provided and give specific examples of bias. + +## Model description + +GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This +means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots +of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, +it was trained to guess the next word in sentences. + +More precisely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, +shifted one token (word or piece of word) to the right. The model uses internally a mask-mechanism to make sure the +predictions for the token `i` only uses the inputs from `1` to `i` but not the future tokens. + +This way, the model learns an inner representation of the English language that can then be used to extract features +useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a +prompt. + +This is the **smallest** version of GPT-2, with 124M parameters. + +**Related Models:** [GPT-Large](https://huggingface.co/gpt2-large), [GPT-Medium](https://huggingface.co/gpt2-medium) and [GPT-XL](https://huggingface.co/gpt2-xl) + +## Intended uses & limitations + +You can use the raw model for text generation or fine-tune it to a downstream task. 
See the +[model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you. + +### How to use + +You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness, we +set a seed for reproducibility: + +```python +>>> from transformers import pipeline, set_seed +>>> generator = pipeline('text-generation', model='gpt2') +>>> set_seed(42) +>>> generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5) + +[{'generated_text': "Hello, I'm a language model, a language for thinking, a language for expressing thoughts."}, + {'generated_text': "Hello, I'm a language model, a compiler, a compiler library, I just want to know how I build this kind of stuff. I don"}, + {'generated_text': "Hello, I'm a language model, and also have more than a few of your own, but I understand that they're going to need some help"}, + {'generated_text': "Hello, I'm a language model, a system model. I want to know my language so that it might be more interesting, more user-friendly"}, + {'generated_text': 'Hello, I\'m a language model, not a language model"\n\nThe concept of "no-tricks" comes in handy later with new'}] +``` + +Here is how to use this model to get the features of a given text in PyTorch: + +```python +from transformers import GPT2Tokenizer, GPT2Model +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') +model = GPT2Model.from_pretrained('gpt2') +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='pt') +output = model(**encoded_input) +``` + +and in TensorFlow: + +```python +from transformers import GPT2Tokenizer, TFGPT2Model +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') +model = TFGPT2Model.from_pretrained('gpt2') +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='tf') +output = model(encoded_input) +``` + +### Limitations and bias + +The training data used for this model has not been released as a dataset one can browse. We know it contains a lot of +unfiltered content from the internet, which is far from neutral. As the openAI team themselves point out in their +[model card](https://github.com/openai/gpt-2/blob/master/model_card.md#out-of-scope-use-cases): + +> Because large-scale language models like GPT-2 do not distinguish fact from fiction, we don’t support use-cases +> that require the generated text to be true. +> +> Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so we do +> not recommend that they be deployed into systems that interact with humans > unless the deployers first carry out a +> study of biases relevant to the intended use-case. We found no statistically significant difference in gender, race, +> and religious bias probes between 774M and 1.5B, implying all versions of GPT-2 should be approached with similar +> levels of caution around use cases that are sensitive to biases around human attributes. 
+ +Here's an example of how the model can have biased predictions: + +```python +>>> from transformers import pipeline, set_seed +>>> generator = pipeline('text-generation', model='gpt2') +>>> set_seed(42) +>>> generator("The White man worked as a", max_length=10, num_return_sequences=5) + +[{'generated_text': 'The White man worked as a mannequin for'}, + {'generated_text': 'The White man worked as a maniser of the'}, + {'generated_text': 'The White man worked as a bus conductor by day'}, + {'generated_text': 'The White man worked as a plumber at the'}, + {'generated_text': 'The White man worked as a journalist. He had'}] + +>>> set_seed(42) +>>> generator("The Black man worked as a", max_length=10, num_return_sequences=5) + +[{'generated_text': 'The Black man worked as a man at a restaurant'}, + {'generated_text': 'The Black man worked as a car salesman in a'}, + {'generated_text': 'The Black man worked as a police sergeant at the'}, + {'generated_text': 'The Black man worked as a man-eating monster'}, + {'generated_text': 'The Black man worked as a slave, and was'}] +``` + +This bias will also affect all fine-tuned versions of this model. + +## Training data + +The OpenAI team wanted to train this model on a corpus as large as possible. To build it, they scraped all the web +pages from outbound links on Reddit which received at least 3 karma. Note that all Wikipedia pages were removed from +this dataset, so the model was not trained on any part of Wikipedia. The resulting dataset (called WebText) weights +40GB of texts but has not been publicly released. You can find a list of the top 1,000 domains present in WebText +[here](https://github.com/openai/gpt-2/blob/master/domains.txt). + +## Training procedure + +### Preprocessing + +The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a +vocabulary size of 50,257. The inputs are sequences of 1024 consecutive tokens. + +The larger model was trained on 256 cloud TPU v3 cores. The training duration was not disclosed, nor were the exact +details of training. 
+ +## Evaluation results + +The model achieves the following results without any fine-tuning (zero-shot): + +| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | +|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| +| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | +| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | + + +### BibTeX entry and citation info + +```bibtex +@article{radford2019language, + title={Language Models are Unsupervised Multitask Learners}, + author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, + year={2019} +} +``` + + + + diff --git a/skops/card/tests/examples/gpt2.md.diff b/skops/card/tests/examples/gpt2.md.diff new file mode 100644 index 00000000..ee0d38f1 --- /dev/null +++ b/skops/card/tests/examples/gpt2.md.diff @@ -0,0 +1,20 @@ +--- ++++ +@@ -89 +88,0 @@ +-> +@@ -96 +95 @@ +-Here's an example of how the model can have biased predictions: ++Here’s an example of how the model can have biased predictions: +@@ -144,5 +143,4 @@ +-| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | +-|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| +-| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | +-| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | +- ++| Dataset | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | ++|-----------|-----------|----------|----------|-------------|-------|-----------|---------|---------------|-------| ++| (metric) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | ++| | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | +@@ -161 +159 @@ +- ++ diff --git a/skops/card/tests/examples/specter.md b/skops/card/tests/examples/specter.md new file mode 100644 index 00000000..e53e77ce --- /dev/null +++ b/skops/card/tests/examples/specter.md @@ -0,0 +1,26 @@ +--- +language: en +thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" +license: apache-2.0 +datasets: +- SciDocs +metrics: +- F1 +- accuracy +- map +- ndcg +--- + +## SPECTER + + + +SPECTER is a pre-trained language model to generate document-level embedding of documents. It is pre-trained on a a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. + +Paper: [SPECTER: Document-level Representation Learning using Citation-informed Transformers](https://arxiv.org/pdf/2004.07180.pdf) + +Original Repo: [Github](https://github.com/allenai/specter) + +Evaluation Benchmark: [SciDocs](https://github.com/allenai/scidocs) + +Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. 
Weld* diff --git a/skops/card/tests/examples/specter.md.diff b/skops/card/tests/examples/specter.md.diff new file mode 100644 index 00000000..647c85a8 --- /dev/null +++ b/skops/card/tests/examples/specter.md.diff @@ -0,0 +1,8 @@ +--- ++++ +@@ -3 +3 @@ +-## SPECTER ++# SPECTER +@@ -15 +15 @@ +-Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld* ++Authors: _Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld_ diff --git a/skops/card/tests/examples/toy-example.md b/skops/card/tests/examples/toy-example.md new file mode 100644 index 00000000..44669f11 --- /dev/null +++ b/skops/card/tests/examples/toy-example.md @@ -0,0 +1,182 @@ +# This document tries to cover many common markdown contents + +This is not based on an existing model card and serves to increase test coverage. It also documents differences that may be found after parsing. There is no metainfo section. + +## H2 + +### H3 + +#### H4 + +##### H5 + +###### H6 + +Parser 'preserves' some "quotation" marks. + +Parser doesn’t ‘preserve’ other “quotation” marks. + +## Italics + +One _way_ of doing it. +Another *way* of doing it. + +## Bold + +One __way__ of doing it. +Another **way** of doing it. + +## Strikethrough + +This is ~~not~~ the way. + +## Superscript and subscripts + +Really just html tags. + +E = mc2 + +log2 + +## Bullet lists + +Pandoc does not differentiate between different notations, so we always use -, not * or +. + +* using +* asterisk + +or + +- using +- minus + with line break + +or + ++ using plus + +Finally: + +- nesting + - is +- indeed + - very + - possible + - to achieve + +## Ordered lists + +1. a normal +2. ordered list + +or + +1. an ordered +2. list + 1. with + 2. indentation +3. is possible + +## Mixed lists + +1. it’s +2. possible + - to + - mix +3. ordered _and_ unorderd + +## TODOs + +- [x] This +- [ ] is +- [x] **done** + +## Links + +[a link](https://skops.readthedocs.io/) + +The "title" is not parsed by pandoc + +[a link](https://skops.readthedocs.io/ "this disappears") + +[a link to a file](./toy-example.md) + +References are resolved, so `[1]` below is replaced by the actual link: + +[a link with reference][1] + +A plain link to https://skops.readthedocs.io/ used inside of text. + +[1]: https://skops.readthedocs.io/ + +## Images + +![skops logo](https://github.com/skops-dev/skops/blob/main/docs/images/logo.png) + +### Using html + +logo + +## Quotes + +> Someone said something importent + +> I quote wise words: +> > Someone said something importent + +## Tables + +| Header 0 | Header 1 | +|--------------|----------------| +| Some content | More content | +| _Even more_ | This is **it** | + +Empty tables are legal + +| What now? | +|-------------| + +## Inline code + +Some `inline` code. + +`A whole line` + +## Code blocks + +``` +A raw + +code block +``` + +With language + +```python +def foo(): + return 0 + +def bar(): + return 1 +``` + +## Raw HTML + + +
+
Beast of Bodmin
+
A large feline inhabiting Bodmin Moor.
+ +
Morgawr
+
A sea serpent.
+ +
Owlman
+
A giant owl-like creature.
+
+ +## Div + +The "id" tag may change in order +
+

Divs are possible

+
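toy-example.md deliberately mixes the constructs above to exercise the pandoc-based parser. A minimal sketch of inspecting the representation the parser consumes (assuming `pandoc` is available on the PATH):

```python
import json
import subprocess

# Convert the file into pandoc's JSON AST, the format PandocParser reads.
result = subprocess.run(
    ["pandoc", "-f", "markdown", "-t", "json", "toy-example.md"],
    capture_output=True, text=True, check=True,
)
ast = json.loads(result.stdout)
print(ast["pandoc-api-version"])  # e.g. [1, 22, 2, 1]
print([block["t"] for block in ast["blocks"]][:8])  # e.g. ['Header', 'Para', ...]
```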
diff --git a/skops/card/tests/examples/toy-example.md.diff b/skops/card/tests/examples/toy-example.md.diff new file mode 100644 index 00000000..9c00ed2a --- /dev/null +++ b/skops/card/tests/examples/toy-example.md.diff @@ -0,0 +1,90 @@ +--- ++++ +@@ -0,0 +1 @@ ++ +@@ -17 +18 @@ +-Parser doesn’t ‘preserve’ other “quotation” marks. ++Parser doesn’t 'preserve' other "quotation" marks. +@@ -22 +23 @@ +-Another *way* of doing it. ++Another _way_ of doing it. +@@ -26 +27 @@ +-One __way__ of doing it. ++One **way** of doing it. +@@ -45,2 +46,2 @@ +-* using +-* asterisk ++- using ++- asterisk +@@ -56 +57 @@ +-+ using plus ++- using plus +@@ -100 +101 @@ +-[a link](https://skops.readthedocs.io/ "this disappears") ++[a link](https://skops.readthedocs.io/) +@@ -106 +107 @@ +-[a link with reference][1] ++[a link with reference](https://skops.readthedocs.io/) +@@ -109,2 +109,0 @@ +- +-[1]: https://skops.readthedocs.io/ +@@ -164 +163,6 @@ +- ++ ++ +@@ -167,8 +171,37 @@ +-
Beast of Bodmin
+-
A large feline inhabiting Bodmin Moor.
+- +-
Morgawr
+-
A sea serpent.
+- +-
Owlman
+-
A giant owl-like creature.
++ ++
++ ++Beast of Bodmin ++ ++
++ ++
++ ++A large feline inhabiting Bodmin Moor. ++ ++
++ ++
++ ++Morgawr ++ ++
++ ++
++ ++A sea serpent. ++ ++
++ ++
++ ++Owlman ++ ++
++ ++
++ ++A giant owl-like creature. ++ ++
++ +@@ -180,3 +213,2 @@ +-
+-

Divs are possible

+-
++ ++

Divs are possible

diff --git a/skops/card/tests/examples/vit-base-patch32-224-in21k.md b/skops/card/tests/examples/vit-base-patch32-224-in21k.md new file mode 100644 index 00000000..570f5916 --- /dev/null +++ b/skops/card/tests/examples/vit-base-patch32-224-in21k.md @@ -0,0 +1,94 @@ +--- +license: apache-2.0 +tags: +- vision +datasets: +- imagenet-21k +inference: false +--- + +# Vision Transformer (base-sized model) + + + +Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224. It was introduced in the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Dosovitskiy et al. and first released in [this repository](https://github.com/google-research/vision_transformer). However, the weights were converted from the [timm repository](https://github.com/rwightman/pytorch-image-models) by Ross Wightman, who already converted the weights from JAX to PyTorch. Credits go to him. + +Disclaimer: The team releasing ViT did not write a model card for this model so this model card has been written by the Hugging Face team. + +## Model description + +The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels. + +Images are presented to the model as a sequence of fixed-size patches (resolution 32x32), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. + +Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). + +By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. + +## Intended uses & limitations + +You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=google/vit) to look for +fine-tuned versions on a task that interests you. + +### How to use + +Here is how to use this model: + +```python +from transformers import ViTFeatureExtractor, ViTModel +from PIL import Image +import requests +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch32-224-in21k') +model = ViTModel.from_pretrained('google/vit-base-patch32-224-in21k') +inputs = feature_extractor(images=image, return_tensors="pt") +outputs = model(**inputs) +last_hidden_state = outputs.last_hidden_state +``` + +Currently, both the feature extractor and model support PyTorch. Tensorflow and JAX/FLAX are coming soon, and the API of ViTFeatureExtractor might change. + +## Training data + +The ViT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes. 
+ +## Training procedure + +### Preprocessing + +The exact details of preprocessing of images during training/validation can be found [here](https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py). + +Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5). + +### Pretraining + +The model was trained on TPUv3 hardware (8 cores). All model variants are trained with a batch size of 4096 and learning rate warmup of 10k steps. For ImageNet, the authors found it beneficial to additionally apply gradient clipping at global norm 1. Pre-training resolution is 224. + +## Evaluation results + +For evaluation results on several image classification benchmarks, we refer to tables 2 and 5 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution (384x384). Of course, increasing the model size will result in better performance. + +### BibTeX entry and citation info + +```bibtex +@misc{wu2020visual, + title={Visual Transformers: Token-based Image Representation and Processing for Computer Vision}, + author={Bichen Wu and Chenfeng Xu and Xiaoliang Dai and Alvin Wan and Peizhao Zhang and Zhicheng Yan and Masayoshi Tomizuka and Joseph Gonzalez and Kurt Keutzer and Peter Vajda}, + year={2020}, + eprint={2006.03677}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +```bibtex +@inproceedings{deng2009imagenet, + title={Imagenet: A large-scale hierarchical image database}, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + booktitle={2009 IEEE conference on computer vision and pattern recognition}, + pages={248--255}, + year={2009}, + organization={Ieee} +} +``` diff --git a/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff new file mode 100644 index 00000000..b48c0b73 --- /dev/null +++ b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff @@ -0,0 +1,5 @@ +--- ++++ +@@ -17 +17 @@ +-Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). ++Note that this model does not provide any fine-tuned heads, as these were zero’d by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 
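The example cards and `.md.diff` files above are consumed by the parser tests that follow. A minimal sketch of the round trip they exercise, using the public API added in this PR (paths refer to the fixtures above):

```python
from skops.card import parse_modelcard

# Parse a markdown model card back into a Card object ...
card = parse_modelcard("skops/card/tests/examples/gpt2.md")
# ... and render it out again; the remaining differences between the two
# files should be exactly the ones recorded in gpt2.md.diff.
card.save("gpt2-roundtrip.md")
```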
diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py
new file mode 100644
index 00000000..b74486fe
--- /dev/null
+++ b/skops/card/tests/test_parser.py
@@ -0,0 +1,180 @@
+import difflib
+import json
+import os
+import re
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+import yaml  # type: ignore
+
+from skops.card import parse_modelcard
+from skops.card._parser import PandocParser, check_pandoc_installed
+
+try:
+    check_pandoc_installed()
+except FileNotFoundError:
+    # not installed, skip
+    pytest.skip(reason="These tests require a recent pandoc", allow_module_level=True)
+
+
+EXAMPLE_CARDS = [
+    # actual model cards from HF hub
+    "bert-base-uncased.md",
+    "clip-vit-large-patch14.md",
+    "gpt2.md",
+    "specter.md",
+    "vit-base-patch32-224-in21k.md",
+    # not a model card
+    "toy-example.md",
+]
+
+
+def _assert_meta_equal(meta0, meta1):
+    # the order of metadata items is not guaranteed, so we compare the parsed
+    # dicts rather than comparing the strings directly
+    assert yaml.safe_load("".join(meta0)) == yaml.safe_load("".join(meta1))
+
+
+def assert_readme_files_almost_equal(file0, file1, diff):
+    """Check that the two model cards are almost identical, allowing only the
+    differences defined in the ``diff`` file
+
+    The metainfo is compared separately, as the order of its items is not
+    guaranteed to be stable.
+    """
+    with open(file0, "r") as f:
+        readme0 = f.readlines()
+
+    with open(file1, "r") as f:
+        readme1 = f.readlines()
+
+    sep = "---\n"
+    # we look for the 2nd occurrence of the separator, so skip the first line
+    # to not match the 1st occurrence
+    if sep in readme0[1:]:  # only check if metainfo is present
+        idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1
+        meta0, meta1 = readme0[1:idx0], readme1[1:idx1]
+        readme0, readme1 = readme0[idx0:], readme1[idx1:]
+        _assert_meta_equal(meta0, meta1)
+
+    # exclude the trivial case of both being empty
+    assert readme0
+    assert readme1
+
+    diff_actual = list(difflib.unified_diff(readme0, readme1, n=0))
+
+    with open(diff, "r") as f:
+        diff_expected = f.readlines()
+
+    assert diff_actual == diff_expected
+
+
+@pytest.mark.parametrize("file_name", EXAMPLE_CARDS, ids=EXAMPLE_CARDS)
+def test_example_model_cards(tmp_path, file_name):
+    """Test that the difference between original and parsed model card is
+    acceptable
+
+    For this test, model cards for some of the most popular models on HF Hub
+    were retrieved and stored in the ./examples folder. This test checks that
+    these model cards can be successfully parsed and that the output is *almost*
+    the same.
+
+    We don't expect the output to be 100% identical, see the limitations listed
+    in ``parse_modelcard``. Instead, we assert that the diff corresponds to the
+    expected diff, which is also checked in.
+
+    So e.g. for "specter.md", we expect that the diff will be the same diff as
+    in "specter.md.diff".
+
+    """
+    # the example files are resolved relative to the current working
+    # directory, so these tests are expected to run from the repo root
+    path = Path(os.getcwd()) / "skops" / "card" / "tests" / "examples"
+    file0 = path / file_name
+    diff = (path / file_name).with_suffix(".md.diff")
+
+    parsed_card = parse_modelcard(file0)
+    file1 = tmp_path / "readme-parsed.md"
+    parsed_card.save(file1)
+
+    assert_readme_files_almost_equal(file0, file1, diff)
+
+
+def test_unknown_pandoc_item_raises():
+    source = json.dumps(
+        {
+            "pandoc-api-version": [1, 22, 2, 1],
+            "meta": {},
+            "blocks": [
+                {
+                    "t": "Header",
+                    "c": [1, ["section", [], []], [{"t": "Str", "c": "section"}]],
+                },
+                {"c": "valid", "t": "Str"},
+                {"t": "does-not-exist", "c": []},
+                {"c": "okay", "t": "Str"},
+            ],
+        }
+    )
+    parser = PandocParser(source)
+    msg = (
+        "The parsed document contains 'does-not-exist', which is not "
+        "supported yet, please open an issue on GitHub"
+    )
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        parser.generate()
+
+
+def test_content_without_section_raises():
+    source = json.dumps(
+        {
+            "pandoc-api-version": [1, 22, 2, 1],
+            "meta": {},
+            "blocks": [
+                {"c": "whoops", "t": "Str"},
+            ],
+        }
+    )
+    parser = PandocParser(source)
+    msg = (
+        "Trying to add content but there is no current section, this is probably a "
+        "bug, please open an issue on GitHub"
+    )
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        parser.generate()
+
+
+def test_unsupported_markup_raises():
+    match = re.escape("Markup of type does-not-exist is not supported (yet)")
+    with pytest.raises(ValueError, match=match):
+        PandocParser(source="", markup_type="does-not-exist")
+
+
+def test_check_pandoc_installed_no_min_version_works():
+    # check that it doesn't raise
+    check_pandoc_installed(min_version=None)
+
+
+def test_check_pandoc_installed_min_version_too_high_raises():
+    match = re.escape("Pandoc version too low, expected at least 999.9.9, got")
+    with pytest.raises(ValueError, match=match):
+        check_pandoc_installed(min_version="999.9.9")
+
+
+def test_pandoc_not_installed():
+    def raise_filenotfound(*args, **kwargs):
+        # error raised when trying to run subprocess on non-existing command
+        raise FileNotFoundError("[Errno 2] No such file or directory: 'pandoc'")
+
+    with patch("subprocess.run", raise_filenotfound):
+        match = re.escape(
+            "This feature requires the pandoc library to be installed on your system"
+        )
+        with pytest.raises(FileNotFoundError, match=match):
+            check_pandoc_installed()
+
+
+def test_pandoc_version_cannot_be_determined():
+    # subprocess.run is replaced by a bare Mock, so its output cannot be
+    # parsed as a version string
+    mock = Mock()
+    with patch("subprocess.run", mock):
+        match = re.escape("Could not determine version of pandoc")
+        with pytest.raises(RuntimeError, match=match):
+            check_pandoc_installed()
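For a new example card, the expected `.md.diff` fixture can be produced the same way `assert_readme_files_almost_equal` computes the actual diff. A sketch (file names hypothetical; note that the test additionally compares the metadata block separately):

```python
import difflib

from skops.card import parse_modelcard

# Round-trip the new example card through the parser.
parse_modelcard("new-example.md").save("new-example-parsed.md")

with open("new-example.md") as f0, open("new-example-parsed.md") as f1:
    diff = difflib.unified_diff(f0.readlines(), f1.readlines(), n=0)

# Store next to the example card so the test can pick it up.
with open("new-example.md.diff", "w") as f:
    f.writelines(diff)
```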