From faf9e3734ae700886e9b9ac71025e559ae77af13 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 25 Oct 2022 16:47:20 +0200 Subject: [PATCH 01/47] [skip ci] [WIP] Alternative Card implementation This is a suggestion to address the issues discussed on #72 Description The proposed model card implementation would allow sections to be added or overwritten dynamically. This is not a complete implementation but already covers most of the features we already have and then some. On top of these features, it would be possible to add more features like creating a default Card with placeholders, just like the existing template, or the possibility to delete existing sections or to retrieve the result of a certain section. Implementation The underlying data structure consists of a dict and a Section dataclass. All data is stored in a _data attribute with the type dict[str, Section]. The dataclass holds the section contents, i.e. the section title, the section content, and subsections, which again have the same type. It's thus a recursive data structure. Section title and dict key are identical, which is mostly for convenience. With this refactor, there are no separate data containers anymore for eval results, template sections, extra sections, etc. They are all treated the same. IMHO, this greatly simplifies the code overall. The only complex function that's left is the one needed to traverse the tree holding the data, and even that is just 14 LOC. Demo To see how the new class can be used, take a look at the main function. 
The resulting Card can be seen here: https://huggingface.co/skops-ci/hf_hub_example-fcc0d6fe-d072-4f94-8fdb-6bf3bb917bca --- skops/card/_card_alternative.py | 361 ++++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100644 skops/card/_card_alternative.py diff --git a/skops/card/_card_alternative.py b/skops/card/_card_alternative.py new file mode 100644 index 00000000..656ae18c --- /dev/null +++ b/skops/card/_card_alternative.py @@ -0,0 +1,361 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path +from reprlib import Repr +from typing import Any, Iterator, Protocol + +from huggingface_hub import CardData +from sklearn.utils import estimator_html_repr +from tabulate import tabulate # type: ignore + +from skops.card._model_card import PlotSection, TableSection + +aRepr = Repr() +aRepr.maxother = 79 +aRepr.maxstring = 79 + + +def split_subsection_names(key: str) -> list[str]: + return key.split("/") + + +def _clean_table(table: str) -> str: + # replace line breaks "\n" with html tag
, however, leave end-of-line + # line breaks (eol_lb) intact + eol_lb = "|\n" + placeholder = "$%!?" # arbitrary sting that never appears naturally + table = ( + table.replace(eol_lb, placeholder) + .replace("\n", "
") + .replace(placeholder, eol_lb) + ) + return table + + +@dataclass +class Section: + title: str + content: Formattable | str | None = None + subsections: dict[str, Section] = field(default_factory=dict) + + +class Formattable(Protocol): + def format(self) -> str: + ... + + +class Card: + @classmethod + def make_default( + cls, model, model_diagram: bool = True, metadata: CardData | None = None + ): + """Add a bunch of default sections, yet to be implemented""" + raise NotImplementedError + + def __init__( + self, model, model_diagram: bool = True, metadata: CardData | None = None + ): + self.model = model + self.model_diagram = model_diagram + self.metadata = metadata or CardData() + + self._data: dict[str, Section] = {} + self._metrics: dict[str, float | int] = {} + self._reset() + + def _reset(self) -> None: + self._add_model(self.model) + + model_file = self.metadata.to_dict().get("model_file") + if model_file: + self._add_get_started_code(model_file) + + self._add_model_section() + self._add_hyperparams() + + def add(self, **kwargs: str) -> "Card": + for key, val in kwargs.items(): + self._add_single(key, val) + return self + + def _add_single(self, key: str, val: Formattable | str) -> None: + section = self._data + *subsection_names, leaf_node_name = split_subsection_names(key) + for subsection_name in subsection_names: + section_maybe = section.get(subsection_name) + + # there are already subsections + if section_maybe is not None: + section = section_maybe.subsections + continue + + # no subsection, create + entry = Section(title=subsection_name) + section[subsection_name] = entry + section = entry.subsections + + if leaf_node_name in section: + # entry exists, only overwrite content + section[leaf_node_name].content = val + else: + # entry does not exist, create a new one + section[leaf_node_name] = Section(title=leaf_node_name, content=val) + + def _add_model(self, model) -> None: + model = getattr(self, "model", None) + if model is None: + return + + 
model_str = self._strip_blank(repr(model)) + model_repr = aRepr.repr(f"model={model_str},").strip('"').strip("'") + self._add_single("Model description", model_repr) + + def _add_model_section(self) -> None: + if not self.model_diagram: + return + + model_plot_div = re.sub(r"\n\s+", "", str(estimator_html_repr(self.model))) + if model_plot_div.count("sk-top-container") == 1: + model_plot_div = model_plot_div.replace( + "sk-top-container", 'sk-top-container" style="overflow: auto;' + ) + self._add_single("Model Plot", model_plot_div) + + def _add_hyperparams(self) -> None: + hyperparameter_dict = self.model.get_params(deep=True) + table = _clean_table( + tabulate( + list(hyperparameter_dict.items()), + headers=["Hyperparameter", "Value"], + tablefmt="github", + ) + ) + self._add_single("Model description/Training Procedure/Hyperparameters", table) + + def add_plot(self, folded=False, **kwargs: str) -> "Card": + for plot_name, plot_path in kwargs.items(): + section = PlotSection(alt_text=plot_name, path=plot_path, folded=folded) + self._add_single(plot_name, section) + return self + + def add_table(self, folded: bool = False, **kwargs: dict["str", list[Any]]) -> Card: + for key, val in kwargs.items(): + section = TableSection(table=val, folded=folded) + self._add_single(key, section) + return self + + def add_metrics(self, **kwargs: int | float) -> "Card": + self._metrics.update(kwargs) + self._add_metrics(self._metrics) + return self + + def _add_metrics(self, metrics: dict[str, float | int]) -> None: + table = tabulate( + list(metrics.items()), + headers=["Metric", "Value"], + tablefmt="github", + ) + self._add_single("Model description/Evaluation Results", table) + + def _generate_metadata(self, metadata: CardData) -> Iterator[str]: + for key, val in metadata.to_dict().items() if metadata else {}: + if key == "widget": + yield "metadata.widget={...}," + continue + + yield aRepr.repr(f"metadata.{key}={val},").strip('"').strip("'") + + @staticmethod + def 
_strip_blank(text) -> str: + # remove new lines and multiple spaces + text = text.replace("\n", " ") + text = re.sub(r"\s+", r" ", text) + return text + + def _generate_content(self, data, depth: int = 1) -> Iterator[str]: + for val in data.values(): + title = f"{depth * '#'} {val.title}" + yield title + + if isinstance(val.content, str): + yield val.content + elif val.content is not None: # is Formattable + yield val.content.format() + + if val.subsections: + yield from self._generate_content(val.subsections, depth=depth + 1) + + def __str__(self) -> str: + return self.__repr__() + + def __repr__(self) -> str: + metadata_repr = "\n".join( + " " + line for line in self._generate_metadata(self.metadata) + ) + content_repr = "\n\n".join( + " " + line for line in self._generate_content(self._data) + ) + + complete_repr = "Card(\n" + if metadata_repr: + complete_repr += metadata_repr + "\n" + if content_repr: + complete_repr += content_repr + "\n" + complete_repr += ")" + return complete_repr + + def _add_get_started_code(self, file_name: str, indent: str = " ") -> None: + lines = [ + "import json", + "import pandas as pd", + ] + if file_name.endswith(".skops"): + lines += [ + "from skops.io import load", + f'model = load("{file_name}")', + ] + else: # pickle + lines += [ + "import pickle", + f"with open('{file_name}') as f:", + indent + "model = pickle.load(f)", + ] + + lines += [ + 'with open("config.json") as f:', + indent + "config = json.load(f)", + 'clf.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))', + ] + self._add_single("How to Get Started with the Model", "\n".join(lines)) + + def _generate_card(self) -> Iterator[str]: + if self.metadata: + yield f"---\n{self.metadata.to_yaml()}\n---" + + for line in self._generate_content(self._data): + yield "\n" + line + + def save(self, path: str | Path) -> None: + """Save the model card. + + This method renders the model card in markdown format and then saves it + as the specified file. 
+ + Parameters + ---------- + path: str, or Path + Filepath to save your card. + + Notes + ----- + The keys in model card metadata can be seen `here + `__. + """ + with open(path, "w") as f: + f.write("\n".join(self._generate_card())) + + def render(self) -> str: + """Render the final model card as a string. + + Returns + ------- + card : str + The rendered model card with all placeholders filled and all extra + sections inserted. + """ + return "\n".join(self._generate_card()) + + +def main(): + import os + import pickle + import tempfile + from uuid import uuid4 + + import matplotlib.pyplot as plt + import sklearn + from huggingface_hub import HfApi + from sklearn.datasets import load_iris + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + + from skops import hub_utils + from skops.card import metadata_from_config + + X, y = load_iris(return_X_y=True, as_frame=True) + + model = Pipeline( + [("scaler", StandardScaler()), ("clf", LogisticRegression(random_state=123))] + ).fit(X, y) + + pkl_file = tempfile.mkstemp(suffix=".pkl", prefix="skops-test")[1] + with open(pkl_file, "wb") as f: + pickle.dump(model, f) + + with tempfile.TemporaryDirectory(prefix="skops-test") as destination_path: + hub_utils.init( + model=pkl_file, + requirements=[f"scikit-learn=={sklearn.__version__}"], + dst=destination_path, + task="tabular-classification", + data=X, + ) + card = Card(model, metadata=metadata_from_config(destination_path)) + + # add a placeholder for figures + card.add(Plots="") + + # add arbitrary sections, overwrite them, etc. 
+ card.add(hi="howdy") + card.add(**{"parent section/child section": "child content"}) + card.add(**{"foo": "bar", "spam": "eggs"}) + # change content of "hi" section + card.add(**{"hi/german": "guten tag", "hi/french": "salut"}) + card.add(**{"very/deeply/nested/section": "but why?"}) + + # add metrics + card.add_metrics(**{"acc": 0.1}) + + # insert the plot in the "Plot" section we inserted above + plt.plot([4, 5, 6, 7]) + plt.savefig(Path(destination_path) / "fig1.png") + card.add_plot(**{"Plots/A beautiful plot": "fig1.png"}) + + # add table + table = {"split": [1, 2, 3], "score": [4, 5, 6]} + card.add_table( + folded=True, + **{"Model description/Training Procedure/Yet another table": table}, + ) + + # more metrics + card.add_metrics(**{"f1": 0.2, "roc": 123}) + + # add content for "Model description" section, which has subsections but + # otherwise no content + card.add(**{"Model description": "This is a fantastic model"}) + + card.save(Path(destination_path) / "README.md") + print(destination_path) + + # pushing to Hub + token = os.environ["HF_HUB_TOKEN"] + repo_name = f"hf_hub_example-{uuid4()}" + user_name = HfApi().whoami(token=token)["name"] + repo_id = f"{user_name}/{repo_name}" + print(f"Creating and pushing to repo: {repo_id}") + hub_utils.push( + repo_id=repo_id, + source=destination_path, + token=token, + commit_message="testing model cards", + create_remote=True, + private=False, + ) + + +if __name__ == "__main__": + main() From cc229a209bfaf272242a656c9ebe9537db75afd1 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 31 Oct 2022 17:57:48 +0100 Subject: [PATCH 02/47] [WIP] Further align new model card design Added a test that shows that the new card produces the same output as the old card (except for a few non-deterministic parts). This includes most of the idiosyncrasies of the old card we might want to change in the future (e.g. inconsistent capitalization, use of empty lines). 
Some of the more problematic behaviors of the old card class were, however, fixed (e.g. creating an empty metrics table when there are no metrics). The other tests have been reworked to use the new card features to make them more precise. Often, that means that instead of having a very weak test like "assert 'foo' in card.render()", it is now possible to select the exact section and check that it equals the expected output. This work is still unfinished, specifically it still lacks tests for the card repr and for the newly added features. --- skops/card/_card_alternative.py | 174 ++++- skops/card/_model_card.py | 20 +- skops/card/default_template.md | 2 +- skops/card/tests/test_card.py | 2 +- skops/card/tests/test_card_alternative.py | 761 ++++++++++++++++++++++ 5 files changed, 913 insertions(+), 46 deletions(-) create mode 100644 skops/card/tests/test_card_alternative.py diff --git a/skops/card/_card_alternative.py b/skops/card/_card_alternative.py index 656ae18c..8326c10f 100644 --- a/skops/card/_card_alternative.py +++ b/skops/card/_card_alternative.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import textwrap from dataclasses import dataclass, field from pathlib import Path from reprlib import Repr @@ -17,6 +18,43 @@ aRepr.maxstring = 79 +DEFAULT_TEMPLATE = { + "Model description": "[More Information Needed]", + "Model description/Intended uses & limitations": "[More Information Needed]", + "Model description/Training Procedure/Hyperparameters": """The model is trained with below hyperparameters. + +
+ Click to expand + +{{ hyperparameter_table }} + +
""", + "Model description/Training Procedure/Model Plot": "The model plot is below.", + "Model description/Evaluation Results": """You can find the details about evaluation process and the evaluation results. + + + +[More Information Needed]""", + "How to Get Started with the Model": """Use the code below to get started with the model. + +```python +[More Information Needed] +```""", + "Model Card Authors": """This model card is written by following authors: + +[More Information Needed]""", + "Model Card Contact": """You can contact the model card authors through following channels: +[More Information Needed]""", + "Citation": """Below you can find information related to citation. + +**BibTeX:** +``` +[More Information Needed] +``` +""", +} + + def split_subsection_names(key: str) -> list[str]: return key.split("/") @@ -47,27 +85,24 @@ def format(self) -> str: class Card: - @classmethod - def make_default( - cls, model, model_diagram: bool = True, metadata: CardData | None = None - ): - """Add a bunch of default sections, yet to be implemented""" - raise NotImplementedError - def __init__( - self, model, model_diagram: bool = True, metadata: CardData | None = None + self, + model, + model_diagram: bool = True, + metadata: CardData | None = None, + prefill: bool = True, ): self.model = model self.model_diagram = model_diagram self.metadata = metadata or CardData() self._data: dict[str, Section] = {} - self._metrics: dict[str, float | int] = {} + if prefill: + self._fill_default_sections() + self._metrics: dict[str, str | float | int] = {} self._reset() def _reset(self) -> None: - self._add_model(self.model) - model_file = self.metadata.to_dict().get("model_file") if model_file: self._add_get_started_code(model_file) @@ -75,14 +110,22 @@ def _reset(self) -> None: self._add_model_section() self._add_hyperparams() + def _fill_default_sections(self) -> None: + self.add(**DEFAULT_TEMPLATE) + def add(self, **kwargs: str) -> "Card": for key, val in kwargs.items(): 
self._add_single(key, val) return self - def _add_single(self, key: str, val: Formattable | str) -> None: + def _select( + self, subsection_names: list[str], create: bool = True + ) -> dict[str, Section]: + """TODO""" section = self._data - *subsection_names, leaf_node_name = split_subsection_names(key) + if not subsection_names: + return section + for subsection_name in subsection_names: section_maybe = section.get(subsection_name) @@ -91,10 +134,31 @@ def _add_single(self, key: str, val: Formattable | str) -> None: section = section_maybe.subsections continue - # no subsection, create - entry = Section(title=subsection_name) - section[subsection_name] = entry - section = entry.subsections + if create: + # no subsection, create + entry = Section(title=subsection_name) + section[subsection_name] = entry + section = entry.subsections + else: + raise KeyError(f"Section titles {subsection_name} does not exist") + + return section + + def select(self, key: str | list[str]) -> Section: + assert key # TODO + + if isinstance(key, str): + subsection_names = split_subsection_names(key) + else: + subsection_names = key + + parent_section = self._select(subsection_names[:-1], create=False) + return parent_section[subsection_names[-1]] + + def _add_single(self, key: str, val: Formattable | str) -> None: + section = self._data + *subsection_names, leaf_node_name = split_subsection_names(key) + section = self._select(subsection_names) if leaf_node_name in section: # entry exists, only overwrite content @@ -121,7 +185,11 @@ def _add_model_section(self) -> None: model_plot_div = model_plot_div.replace( "sk-top-container", 'sk-top-container" style="overflow: auto;' ) - self._add_single("Model Plot", model_plot_div) + template = "The model plot is below.\n\n{}" + self._add_single( + "Model description/Training Procedure/Model Plot", + template.format(model_plot_div), + ) def _add_hyperparams(self) -> None: hyperparameter_dict = self.model.get_params(deep=True) @@ -132,12 +200,26 @@ 
def _add_hyperparams(self) -> None: tablefmt="github", ) ) - self._add_single("Model description/Training Procedure/Hyperparameters", table) + template = textwrap.dedent( + """ The model is trained with below hyperparameters. + +
+ Click to expand + + {} + +
""" + ) + self._add_single( + "Model description/Training Procedure/Hyperparameters", + template.format(table), + ) def add_plot(self, folded=False, **kwargs: str) -> "Card": - for plot_name, plot_path in kwargs.items(): + for section_name, plot_path in kwargs.items(): + plot_name = split_subsection_names(section_name)[-1] section = PlotSection(alt_text=plot_name, path=plot_path, folded=folded) - self._add_single(plot_name, section) + self._add_single(section_name, section) return self def add_table(self, folded: bool = False, **kwargs: dict["str", list[Any]]) -> Card: @@ -146,18 +228,25 @@ def add_table(self, folded: bool = False, **kwargs: dict["str", list[Any]]) -> C self._add_single(key, section) return self - def add_metrics(self, **kwargs: int | float) -> "Card": + def add_metrics(self, **kwargs: str | int | float) -> "Card": self._metrics.update(kwargs) self._add_metrics(self._metrics) return self - def _add_metrics(self, metrics: dict[str, float | int]) -> None: + def _add_metrics(self, metrics: dict[str, str | float | int]) -> None: table = tabulate( list(metrics.items()), headers=["Metric", "Value"], tablefmt="github", ) - self._add_single("Model description/Evaluation Results", table) + template = textwrap.dedent( + """ You can find the details about evaluation process and the evaluation results. 
+ + + + {}""" + ) + self._add_single("Model description/Evaluation Results", template.format(table)) def _generate_metadata(self, metadata: CardData) -> Iterator[str]: for key, val in metadata.to_dict().items() if metadata else {}: @@ -174,7 +263,9 @@ def _strip_blank(text) -> str: text = re.sub(r"\s+", r" ", text) return text - def _generate_content(self, data, depth: int = 1) -> Iterator[str]: + def _generate_content( + self, data: dict[str, Section], depth: int = 1 + ) -> Iterator[str]: for val in data.values(): title = f"{depth * '#'} {val.title}" yield title @@ -207,28 +298,41 @@ def __repr__(self) -> str: return complete_repr def _add_get_started_code(self, file_name: str, indent: str = " ") -> None: - lines = [ + is_skops_format = file_name.endswith(".skops") # else, assume pickle + + lines = ["```python"] + if is_skops_format: + lines += ["from skops.io import load"] + else: + lines += ["import joblib"] + + lines += [ "import json", "import pandas as pd", ] - if file_name.endswith(".skops"): + if is_skops_format: lines += [ "from skops.io import load", f'model = load("{file_name}")', ] else: # pickle - lines += [ - "import pickle", - f"with open('{file_name}') as f:", - indent + "model = pickle.load(f)", - ] + lines += [f"model = joblib.load({file_name})"] lines += [ 'with open("config.json") as f:', indent + "config = json.load(f)", - 'clf.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))', + 'model.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))', + "```", ] - self._add_single("How to Get Started with the Model", "\n".join(lines)) + template = textwrap.dedent( + """ Use the code below to get started with the model. 
+ + {} + """ + ) + self._add_single( + "How to Get Started with the Model", template.format("\n".join(lines)) + ) def _generate_card(self) -> Iterator[str]: if self.metadata: @@ -261,7 +365,7 @@ def render(self) -> str: Returns ------- - card : str + result : str The rendered model card with all placeholders filled and all extra sections inserted. """ diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index c978e670..22f782e3 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -395,20 +395,20 @@ def _generate_card(self) -> ModelCard: template_sections["get_started_code"] = ( "from skops.io import load\nimport json\n" "import pandas as pd\n" - f'clf = load("{model_file}")\n' + f'model = load("{model_file}")\n' 'with open("config.json") as f:\n ' " config =" " json.load(f)\n" - 'clf.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))' + 'model.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))' ) else: template_sections["get_started_code"] = ( - "import joblib\nimport json\nimport pandas as pd\nclf =" + "import joblib\nimport json\nimport pandas as pd\nmodel =" f' joblib.load({model_file})\nwith open("config.json") as' " f:\n " " config =" " json.load(f)\n" - 'clf.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))' + 'model.predict(pd.DataFrame.from_dict(config["sklearn"]["example_input"]))' ) if self.model_diagram is True: model_plot_div = re.sub(r"\n\s+", "", str(estimator_html_repr(self.model))) @@ -419,11 +419,13 @@ def _generate_card(self) -> ModelCard: model_plot: str | None = model_plot_div else: model_plot = None - template_sections["eval_results"] = tabulate( - list(self._eval_results.items()), - headers=["Metric", "Value"], - tablefmt="github", - ) + + if self._eval_results: # only add metrics if there are any + template_sections["eval_results"] = tabulate( + list(self._eval_results.items()), + headers=["Metric", "Value"], + tablefmt="github", + ) # if template path is 
not given, use default if template_sections.get("template_path") is None: diff --git a/skops/card/default_template.md b/skops/card/default_template.md index edbc8d49..91141dfe 100644 --- a/skops/card/default_template.md +++ b/skops/card/default_template.md @@ -29,7 +29,7 @@ The model plot is below. {{ model_plot }} -## Evaluation Results +## Evaluation Results You can find the details about evaluation process and the evaluation results. diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index e2ed4596..4cb6d4ec 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -222,7 +222,7 @@ def test_code_autogeneration_skops( filename = metadata["model_file"] with open(Path(destination_path) / "README.md") as f: read_buffer = f.read() - assert f'clf = load("{filename}")' in read_buffer + assert f'model = load("{filename}")' in read_buffer # test if the model doesn't overflow the huggingface models page assert read_buffer.count("sk-top-container") == 1 diff --git a/skops/card/tests/test_card_alternative.py b/skops/card/tests/test_card_alternative.py new file mode 100644 index 00000000..b723b3e2 --- /dev/null +++ b/skops/card/tests/test_card_alternative.py @@ -0,0 +1,761 @@ +import copy +import os +import pickle +import tempfile +from itertools import zip_longest +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pytest +import sklearn +from huggingface_hub import CardData, metadata_load +from sklearn.datasets import load_iris +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.tree import DecisionTreeClassifier + +import skops +from skops import hub_utils +from skops.card import metadata_from_config +from skops.card._card_alternative import Card +from skops.card._model_card import PlotSection, TableSection +from skops.io import dump + + +def fit_model(): + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + y = np.dot(X, np.array([1, 2])) + 3 + reg = 
LinearRegression().fit(X, y) + return reg + + +@pytest.fixture +def model_card(model_diagram=True): + model = fit_model() + card = Card(model, model_diagram) + yield card + + +@pytest.fixture +def iris_data(): + X, y = load_iris(return_X_y=True, as_frame=True) + yield X, y + + +@pytest.fixture +def iris_estimator(iris_data): + X, y = iris_data + est = LogisticRegression(solver="liblinear").fit(X, y) + yield est + + +@pytest.fixture +def iris_pkl_file(iris_estimator): + pkl_file = tempfile.mkstemp(suffix=".pkl", prefix="skops-test")[1] + with open(pkl_file, "wb") as f: + pickle.dump(iris_estimator, f) + yield pkl_file + + +@pytest.fixture +def iris_skops_file(iris_estimator): + skops_folder = tempfile.mkdtemp() + model_name = "model.skops" + skops_path = Path(skops_folder) / model_name + dump(iris_estimator, skops_path) + yield skops_path + + +def _create_model_card_from_saved_model( + destination_path, + iris_estimator, + iris_data, + save_file, +): + X, y = iris_data + hub_utils.init( + model=save_file, + requirements=[f"scikit-learn=={sklearn.__version__}"], + dst=destination_path, + task="tabular-classification", + data=X, + ) + card = Card(iris_estimator, metadata=metadata_from_config(destination_path)) + card.save(Path(destination_path) / "README.md") + return card + + +@pytest.fixture +def skops_model_card_metadata_from_config( + destination_path, iris_estimator, iris_skops_file, iris_data +): + yield _create_model_card_from_saved_model( + destination_path, iris_estimator, iris_data, iris_skops_file + ) + + +@pytest.fixture +def pkl_model_card_metadata_from_config( + destination_path, iris_estimator, iris_pkl_file, iris_data +): + yield _create_model_card_from_saved_model( + destination_path, iris_estimator, iris_data, iris_pkl_file + ) + + +@pytest.fixture +def destination_path(): + with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: + yield Path(dir_path) + + +def test_save_model_card(destination_path, model_card): + 
model_card.save(Path(destination_path) / "README.md") + assert (Path(destination_path) / "README.md").exists() + + +def test_select_existing_section(): + # TODO + pass + + +def test_select_non_existing_section_raises(): + # TODO + pass + + +def test_hyperparameter_table(destination_path, model_card): + section_name = "Model description/Training Procedure/Hyperparameters" + text_hyperparams = model_card.select(section_name).content + expected = "\n".join( + [ + "The model is trained with below hyperparameters.", + "", + "
", + " Click to expand ", + "", + "| Hyperparameter | Value |", + "|------------------|------------|", + "| copy_X | True |", + "| fit_intercept | True |", + "| n_jobs | |", + "| normalize | deprecated |", + "| positive | False |", + "", + "
", + ] + ) + assert text_hyperparams == expected + + +def _strip_multiple_chars(text, char): + # _strip_multiple_chars("hi there") == "hi there" + # _strip_multiple_chars("|---|--|", "-") == "|-|-|" + while char + char in text: + text = text.replace(char + char, char) + return text + + +def test_hyperparameter_table_with_line_break(destination_path): + # Hyperparameters can contain values with line breaks, "\n", in them. In + # that case, the markdown table is broken. Check that the hyperparameter + # table we create properly replaces the "\n" with "
". + class EstimatorWithLbInParams: + def get_params(self, deep=False): + return {"fit_intercept": True, "n_jobs": "line\nwith\nbreak"} + + model_card = Card(EstimatorWithLbInParams()) + section_name = "Model description/Training Procedure/Hyperparameters" + text_hyperparams = model_card.select(section_name).content + + # remove multiple whitespaces, as they're not important + text_cleaned = _strip_multiple_chars(text_hyperparams, " ") + assert "| n_jobs | line
with
break |" in text_cleaned + + +def test_plot_model(destination_path, model_card): + text_plot = model_card.select( + "Model description/Training Procedure/Model Plot" + ).content + # don't compare whole text, as it's quite long and non-deterministic + assert text_plot.startswith("The model plot is below.\n\n + readme_str1 = readme_str1.replace("\n", "") + + assert readme_str0 == readme_str1 + + +def test_parsed_card_identical(card, tmp_path): + file0 = tmp_path / "readme-skops.md" + card.save(file0) + + parsed_card = parse_modelcard(file0) + file1 = tmp_path / "readme-parsed.md" + parsed_card.save(file1) + + assert_readme_files_equal(file0, file1) From 26892d6978c654c9ba0537295f11b60861dcb31b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 1 Dec 2022 14:49:17 +0100 Subject: [PATCH 19/47] Error when calling add_metric w/ invalid template --- skops/card/_model_card.py | 18 +++++++++++++++--- skops/card/tests/test_card.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index e7933232..0308127e 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -763,10 +763,19 @@ def _add_metrics(self, metrics: dict[str, str | float | int]) -> None: """Add metrics to the Evaluation Results section""" # when not using one of the default templates, there is no predetermined # section to put the metrics - if self.template is None or isinstance(self.template, dict): + if (not self.template) or isinstance(self.template, dict): + raise ValueError( + "Adding metrics is only possible with one of the default templates, " + f"i.e. one of {sorted(VALID_TEMPLATES)}. Instead, consider using the " + ".add method to add a metric to a section, or .add_table to add a " + "table of metrics." 
+ ) return if self.template not in VALID_TEMPLATES: - return + raise ValueError( + f"Unknown template {self.template}, must be " + f"one of {sorted(VALID_TEMPLATES)}" + ) if self._metrics: data_transposed = zip(*self._metrics.items()) # make column oriented @@ -789,7 +798,10 @@ def _add_metrics(self, metrics: dict[str, str | float | int]) -> None: section = "Evaluation/Testing Data, Factors & Metrics/Metrics" else: # should be unreachable - raise ValueError(f"Unknown template {self.template}") + raise ValueError( + f"Unknown template {self.template}, must be " + f"one of {sorted(VALID_TEMPLATES)}" + ) self._add_single(section, template.format(table)) diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index 508b9a2c..f3dc34d3 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -541,6 +541,22 @@ def test_add_metrics(destination_path, model_card): assert eval_metric_content.endswith(expected) +@pytest.mark.parametrize( + "template, msg", + [ + (None, "Adding metrics is only possible with one of"), + ({"My custom template": ""}, "Adding metrics is only possible with one of"), + ("does-not-exist", "Unknown template does-not-exist, must be one of"), + ], +) +def test_add_metric_no_template_raises(template, msg): + # when the template is not one of the standard templates, we cannot know + # where to put the metric, so this should fail with a helpful error message + with pytest.raises(ValueError, match=msg): + card = Card(None, template=template) + card.add_metrics(f1=0.1) + + def test_code_autogeneration( model_card, destination_path, pkl_model_card_metadata_from_config ): From 80a4ae38e240a8f9c01b083eb8ed08fa72ca6724 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 2 Dec 2022 14:28:35 +0100 Subject: [PATCH 20/47] Support model cards with yaml header Most model cards have a yaml section at the top. It is now detached before parsing with pandoc, then re-added afterwards. 
Add a test with the model card from bert-base-uncased. It still fails with some minor issues at the moment (most notably table column alignment). --- skops/card/_parser.py | 58 ++++ .../card/tests/examples/bert-base-uncased.md | 249 ++++++++++++++++++ skops/card/tests/test_parser.py | 20 ++ 3 files changed, 327 insertions(+) create mode 100644 skops/card/tests/examples/bert-base-uncased.md diff --git a/skops/card/_parser.py b/skops/card/_parser.py index c4b7344d..e849852d 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -6,9 +6,15 @@ """ +from __future__ import annotations + import json import subprocess from pathlib import Path +from tempfile import mkdtemp +from typing import Any + +import yaml # type: ignore from skops.card import Card from skops.card._model_card import Section @@ -107,6 +113,54 @@ def check_pandoc_installed() -> None: raise FileNotFoundError(msg) from exc +def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]: + """Detach the possibly existing yaml part of the model card + + Model cards always have a markdown part and optionally a yaml part at the + head, delimited by "---". Obviously, pandoc cannot parse that. Therefore, we + detach the yaml part and return it as a separate dict, only leaving + (hopefully) valid markdown. + + path : str or pathlib.Path + The path to the model card file. + + Returns + ------- + file : path + The path to the model card without any yaml metainfo. If the model card + didn't contain that metainfo to begin with, this is just the path to the + original model card. If it did contain metainfo, this is a path to a new + temporary file with the metainfo removed. + + metainfo : dict + The metainfo from the yaml part as a parsed dict. If no metainfo was + present, the dict is empty. 
+ """ + with open(path, "r") as f: + text = f.read() + + sep_start, sep_end = "---\n", "\n---" + + metainfo: dict[str, Any] = {} + if not text.startswith(sep_start): # no metainfo: + return path, metainfo + + idx_separator = text.find(sep_end) + if idx_separator < len(sep_start): # separator shouldn't come earlier than this + return path, metainfo + + # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated + text_clean = text[idx_separator + len(sep_end) :] # noqa: E203 + metainfo = yaml.safe_load( # type: ignore + text[len(sep_start) : idx_separator] # noqa: E203 + ) + + file = Path(mkdtemp()) / "tmp-model-card.md" + with open(file, "w") as f: + f.write(text_clean) + return file, metainfo + + def parse_modelcard(path: str | Path) -> Card: """Read a model card and return a Card object @@ -148,6 +202,8 @@ def parse_modelcard(path: str | Path) -> Card: """ check_pandoc_installed() + path, metainfo = _card_with_detached_metainfo(path) + proc = subprocess.run( ["pandoc", "-t", "json", "-s", str(path)], capture_output=True, @@ -156,5 +212,7 @@ def parse_modelcard(path: str | Path) -> Card: parser = PandocParser(source) card = parser.generate() + for key, val in metainfo.items(): + setattr(card.metadata, key, val) return card diff --git a/skops/card/tests/examples/bert-base-uncased.md b/skops/card/tests/examples/bert-base-uncased.md new file mode 100644 index 00000000..e762fc35 --- /dev/null +++ b/skops/card/tests/examples/bert-base-uncased.md @@ -0,0 +1,249 @@ +--- +language: en +tags: +- exbert +license: apache-2.0 +datasets: +- bookcorpus +- wikipedia +--- + +# BERT base model (uncased) + + + +Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in +[this paper](https://arxiv.org/abs/1810.04805) and first released in +[this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference +between english and English. 
+ +Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by +the Hugging Face team. + +## Model description + +BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it +was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of +publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it +was pretrained with two objectives: + +- Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run + the entire masked sentence through the model and has to predict the masked words. This is different from traditional + recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like + GPT which internally masks the future tokens. It allows the model to learn a bidirectional representation of the + sentence. +- Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes + they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to + predict if the two sentences were following each other or not. + +This way, the model learns an inner representation of the English language that can then be used to extract features +useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard +classifier using the features produced by the BERT model as inputs. + +## Model variations + +BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strips out an accent markers. +Chinese and multilingual uncased and cased versions followed shortly after. 
+Modified preprocessing with whole word masking has replaced subpiece masking in a following work, with the release of two models. +Other 24 smaller models are released afterward. + +The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github. + +| Model | #params | Language | +|------------------------|--------------------------------|-------| +| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | +| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub +| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | +| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | +| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | +| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | +| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | +| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | + +## Intended uses & limitations + +You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to +be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for +fine-tuned versions of a task that interests you. + +Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) +to make decisions, such as sequence classification, token classification or question answering. For tasks such as text +generation you should look at model like GPT2. 
+ +### How to use + +You can use this model directly with a pipeline for masked language modeling: + +```python +>>> from transformers import pipeline +>>> unmasker = pipeline('fill-mask', model='bert-base-uncased') +>>> unmasker("Hello I'm a [MASK] model.") +[{'sequence': "[CLS] hello i'm a fashion model. [SEP]", + 'score': 0.1073106899857521, + 'token': 4827, + 'token_str': 'fashion'}, + {'sequence': "[CLS] hello i'm a role model. [SEP]", + 'score': 0.08774490654468536, + 'token': 2535, + 'token_str': 'role'}, + {'sequence': "[CLS] hello i'm a new model. [SEP]", + 'score': 0.05338378623127937, + 'token': 2047, + 'token_str': 'new'}, + {'sequence': "[CLS] hello i'm a super model. [SEP]", + 'score': 0.04667217284440994, + 'token': 3565, + 'token_str': 'super'}, + {'sequence': "[CLS] hello i'm a fine model. [SEP]", + 'score': 0.027095865458250046, + 'token': 2986, + 'token_str': 'fine'}] +``` + +Here is how to use this model to get the features of a given text in PyTorch: + +```python +from transformers import BertTokenizer, BertModel +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +model = BertModel.from_pretrained("bert-base-uncased") +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='pt') +output = model(**encoded_input) +``` + +and in TensorFlow: + +```python +from transformers import BertTokenizer, TFBertModel +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +model = TFBertModel.from_pretrained("bert-base-uncased") +text = "Replace me by any text you'd like." 
+encoded_input = tokenizer(text, return_tensors='tf') +output = model(encoded_input) +``` + +### Limitations and bias + +Even if the training data used for this model could be characterized as fairly neutral, this model can have biased +predictions: + +```python +>>> from transformers import pipeline +>>> unmasker = pipeline('fill-mask', model='bert-base-uncased') +>>> unmasker("The man worked as a [MASK].") +[{'sequence': '[CLS] the man worked as a carpenter. [SEP]', + 'score': 0.09747550636529922, + 'token': 10533, + 'token_str': 'carpenter'}, + {'sequence': '[CLS] the man worked as a waiter. [SEP]', + 'score': 0.0523831807076931, + 'token': 15610, + 'token_str': 'waiter'}, + {'sequence': '[CLS] the man worked as a barber. [SEP]', + 'score': 0.04962705448269844, + 'token': 13362, + 'token_str': 'barber'}, + {'sequence': '[CLS] the man worked as a mechanic. [SEP]', + 'score': 0.03788609802722931, + 'token': 15893, + 'token_str': 'mechanic'}, + {'sequence': '[CLS] the man worked as a salesman. [SEP]', + 'score': 0.037680890411138535, + 'token': 18968, + 'token_str': 'salesman'}] +>>> unmasker("The woman worked as a [MASK].") +[{'sequence': '[CLS] the woman worked as a nurse. [SEP]', + 'score': 0.21981462836265564, + 'token': 6821, + 'token_str': 'nurse'}, + {'sequence': '[CLS] the woman worked as a waitress. [SEP]', + 'score': 0.1597415804862976, + 'token': 13877, + 'token_str': 'waitress'}, + {'sequence': '[CLS] the woman worked as a maid. [SEP]', + 'score': 0.1154729500412941, + 'token': 10850, + 'token_str': 'maid'}, + {'sequence': '[CLS] the woman worked as a prostitute. [SEP]', + 'score': 0.037968918681144714, + 'token': 19215, + 'token_str': 'prostitute'}, + {'sequence': '[CLS] the woman worked as a cook. [SEP]', + 'score': 0.03042375110089779, + 'token': 5660, + 'token_str': 'cook'}] +``` + +This bias will also affect all fine-tuned versions of this model. 
+ +## Training data + +The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 +unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and +headers). + +## Training procedure + +### Preprocessing + +The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are +then of the form: + +``` +[CLS] Sentence A [SEP] Sentence B [SEP] +``` + +With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in +the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a +consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two +"sentences" has a combined length of less than 512 tokens. + +The details of the masking procedure for each sentence are the following: +- 15% of the tokens are masked. +- In 80% of the cases, the masked tokens are replaced by `[MASK]`. +- In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. +- In the 10% remaining cases, the masked tokens are left as is. + +### Pretraining + +The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size +of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer +used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, +learning rate warmup for 10,000 steps and linear decay of the learning rate after. 
+ +## Evaluation results + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| +| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | + + +### BibTeX entry and citation info + +```bibtex +@article{DBLP:journals/corr/abs-1810-04805, + author = {Jacob Devlin and + Ming{-}Wei Chang and + Kenton Lee and + Kristina Toutanova}, + title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language + Understanding}, + journal = {CoRR}, + volume = {abs/1810.04805}, + year = {2018}, + url = {http://arxiv.org/abs/1810.04805}, + archivePrefix = {arXiv}, + eprint = {1810.04805}, + timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + + + + diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index be33e72e..87d54d84 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -1,3 +1,6 @@ +import os +from pathlib import Path + import numpy as np import pytest from sklearn.linear_model import LinearRegression @@ -73,3 +76,20 @@ def test_parsed_card_identical(card, tmp_path): parsed_card.save(file1) assert_readme_files_equal(file0, file1) + + +@pytest.mark.xfail(reason="small diff, especially in tables") +def test_bert_base_uncased(tmp_path): + file0 = ( + Path(os.getcwd()) + / "skops" + / "card" + / "tests" + / "examples" + / "bert-base-uncased.md" + ) + parsed_card = parse_modelcard(file0) + file1 = tmp_path / "readme-parsed.md" + parsed_card.save(file1) + + assert_readme_files_equal(file0, file1) From 9744998d0d80d358b599a14ecc943e47f0dae326 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 2 Dec 2022 14:30:16 +0100 Subject: [PATCH 
21/47] Add a bunch of more markup support Now supports: - Space - Strong - Emph - Strikeout - Subscript - Superscript - Plain - Str - RawInline - RawBlock - SoftBreak - LineBreak - Para - Header - Image - CodeBlock - Code - Table - Div - Link - BulletList - Quoted --- skops/card/_markup.py | 80 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 6c2f8846..a0425f81 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -17,7 +17,7 @@ class Markdown: This class has a ``mapping`` attribute, which is just a dict. The keys are Pandoc types and the values are functions that transform the corresponding value into a string with markdown syntax. Those functions are all prefixed - with ``md_``, e.g. ``md_Image`` for transforming a pandoc ``Image`` into a + with ``md_``, e.g. ``md_image`` for transforming a pandoc ``Image`` into a markdown figure. From the caller side, only the ``__call__`` method should be used, the rest @@ -30,17 +30,26 @@ def __init__(self): self.mapping = { "Space": self.md_space, "Strong": self.md_strong, + "Emph": self.md_emph, + "Strikeout": self.md_strikeout, + "Subscript": self.md_subscript, + "Superscript": self.md_superscript, "Plain": self.md_plain, "Str": self.md_str, "RawInline": self.md_rawline, "RawBlock": self.md_raw_block, "SoftBreak": self.md_softbreak, + "LineBreak": self.md_linebreak, "Para": self.md_para, "Header": self.md_header, "Image": self.md_image, "CodeBlock": self.md_code_block, + "Code": self.md_code, "Table": self.md_table, "Div": self.md_parse_div, + "Link": self.md_link, + "BulletList": self.md_bullet_list, + "Quoted": self.md_quoted, } @staticmethod @@ -53,6 +62,30 @@ def md_strong(self, value) -> str: parts.append("**") return "".join(parts) + def md_emph(self, value) -> str: + parts = ["_"] + parts += [self.__call__(subitem) for subitem in value] + parts.append("_") + return "".join(parts) + + def 
md_strikeout(self, value) -> str: + parts = ["~~"] + parts += [self.__call__(subitem) for subitem in value] + parts.append("~~") + return "".join(parts) + + def md_subscript(self, value) -> str: + parts = [""] + parts += [self.__call__(subitem) for subitem in value] + parts.append("") + return "".join(parts) + + def md_superscript(self, value) -> str: + parts = [""] + parts += [self.__call__(subitem) for subitem in value] + parts.append("") + return "".join(parts) + def md_plain(self, value) -> str: parts = [self.__call__(subitem) for subitem in value] return "".join(parts) @@ -76,6 +109,10 @@ def md_raw_block(self, item) -> str: def md_softbreak(value) -> str: return "\n" + @staticmethod + def md_linebreak(value) -> str: + return "\n" + def _make_content(self, content): parts = [] for item in content: @@ -105,14 +142,19 @@ def md_image(self, value) -> str: @staticmethod def md_code_block(item: tuple[tuple[int, list[str], list[str]], str]) -> str: # a codeblock consists of: (id, classes, namevals) contents - (_, _, namevals), content = item + (_, classes, _), content = item block_start = "```" - if namevals: # TODO: check if this makes "```python" etc. 
- block_start += namevals[0] + if classes: + block_start += ", ".join(classes) block_end = "```" content = "\n".join((block_start, content, block_end)) return content + @staticmethod + def md_code(item: tuple[Any, str]) -> str: + _, txt = item + return f"`{txt}`" + def md_table(self, item) -> str: _, alignments, _, header, rows = item fn = self.__call__ @@ -162,6 +204,36 @@ def md_parse_div(self, item) -> str: end = "" return "".join([start] + middle + [end]) + def md_link(self, item) -> str: + _, txt, (src, _) = item + txt_formatted = self._make_content(txt) + return f"[{txt_formatted}]({src})" + + def md_bullet_list(self, item) -> str: + parts = [] + for subitem in item: + assert len(subitem) == 1 + content = "".join(self.__call__(i) for i in subitem) + # indent the lines in lists if they contain line breaks + content = content.replace("\n", "\n ") + parts.append(f"- {content}") + return "\n".join(parts) + + def md_quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: + quote_type, content = item + type_ = quote_type["t"] + try: + sym = {"DoubleQuote": '"', "SingleQuote": "'"}[type_] + except KeyError as exc: + msg = ( + f"The parsed document contains '{type_}', which is not " + "supported yet, please open an issue on GitHub" + ) + raise ValueError(msg) from exc + + text = "".join(self.__call__(i) for i in content) + return f"{sym}{text}{sym}" + def __call__(self, item: str | PandocItem) -> str: if isinstance(item, str): return item From d8649411df5329746705cb6ec6f4c7cc1a714d3a Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 5 Dec 2022 16:55:22 +0100 Subject: [PATCH 22/47] Add more test cases for model card parser I added 5 model cards from the top 10 most used models from the Hub (I excluded cards that were too similar to one another). The tests were rewritten so that they should now pass. There are some limitations to the parser that result in the generated cards not being 100% identical. Those limitations are now documented. 
However, I don't believe that those limitations matter, as they make no semantic difference but rather are stylistic or even invisible. The most notable difference is the alignment of columns in tables. The tests pass despite those differences because I rewrote them to include a diff file for each model card. When the generated card is compared to the original one, a diff is created and compared to the checked in diff. This way, we have control over what diff we permit. I had to exclude the folder containing the cards and diffs from the pre-commit task "trailing-whitespace", as we need the trailing whitespace in there. --- .pre-commit-config.yaml | 1 + skops/card/_markup.py | 16 +- skops/card/_parser.py | 35 +++- .../tests/examples/bert-base-uncased.md.diff | 40 +++++ .../tests/examples/clip-vit-large-patch14.md | 147 +++++++++++++++ .../examples/clip-vit-large-patch14.md.diff | 19 ++ skops/card/tests/examples/gpt2.md | 168 ++++++++++++++++++ skops/card/tests/examples/gpt2.md.diff | 22 +++ skops/card/tests/examples/specter.md | 26 +++ skops/card/tests/examples/specter.md.diff | 11 ++ .../examples/vit-base-patch32-224-in21k.md | 94 ++++++++++ .../vit-base-patch32-224-in21k.md.diff | 5 + skops/card/tests/test_parser.py | 70 ++++---- 13 files changed, 619 insertions(+), 35 deletions(-) create mode 100644 skops/card/tests/examples/bert-base-uncased.md.diff create mode 100644 skops/card/tests/examples/clip-vit-large-patch14.md create mode 100644 skops/card/tests/examples/clip-vit-large-patch14.md.diff create mode 100644 skops/card/tests/examples/gpt2.md create mode 100644 skops/card/tests/examples/gpt2.md.diff create mode 100644 skops/card/tests/examples/specter.md create mode 100644 skops/card/tests/examples/specter.md.diff create mode 100644 skops/card/tests/examples/vit-base-patch32-224-in21k.md create mode 100644 skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4068292e..319d9a63 100644 
--- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,6 +6,7 @@ repos: exclude: .github/conda/meta.yaml - id: end-of-file-fixer - id: trailing-whitespace + exclude: skops/card/tests/examples - id: check-case-conflict - id: check-merge-conflict - repo: https://github.com/psf/black diff --git a/skops/card/_markup.py b/skops/card/_markup.py index a0425f81..9ddf1afd 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -50,6 +50,7 @@ def __init__(self): "Link": self.md_link, "BulletList": self.md_bullet_list, "Quoted": self.md_quoted, + "BlockQuote": self.md_block_quote, } @staticmethod @@ -92,7 +93,8 @@ def md_plain(self, value) -> str: @staticmethod def md_str(value) -> str: - return value + # escape \ + return value.replace("\\", "\\\\") @staticmethod def md_rawline(value) -> str: @@ -234,6 +236,18 @@ def md_quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: text = "".join(self.__call__(i) for i in content) return f"{sym}{text}{sym}" + def md_block_quote(self, item: list[PandocItem]) -> str: + parts = [] + for subitem in item: + content = self.__call__(subitem) + # add quote symbolx + content = content.replace("\n", "\n> ") + parts.append(content) + + # add a quote symbol to the very start + text = "> " + "\n> ".join(parts) + return text + def __call__(self, item: str | PandocItem) -> str: if isinstance(item, str): return item diff --git a/skops/card/_parser.py b/skops/card/_parser.py index e849852d..933ea1d8 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -75,16 +75,21 @@ def parse_header(self, item: PandocItem) -> str: self._section_trace = self._section_trace[: level - 1] + [content] return content + def post_process(self, res: str) -> str: + # replace Latin1 space + res = res.replace("\xa0", " ") + return res + def generate(self) -> Card: # Parsing the flat structure, not recursively as in pandocfilters. 
# After visiting the parent node, it's not necessary to visit its # child nodes, because that's already done during parsing. for item in json.loads(self.source)["blocks"]: if item["t"] == "Header": - res = self.parse_header(item) + res = self.post_process(self.parse_header(item)) self.add_section(res) else: - res = self.mapping(item) + res = self.post_process(self.mapping(item)) self.add_content(res) return self.card @@ -189,6 +194,32 @@ def parse_modelcard(path: str | Path) -> Card: >>> # overwrite old card with new one >>> parsed_card.save("README.md") + Notes + ----- + There are some **known limitations** to the parser that may result in the + model card generated from the parsed file not being 100% identical to the + original model card: + + - In markdown, bold and italic text can be encoded in different fashions, + e.g. ``_like this_`` or ``*like this*`` for italic text. Pandoc doesn't + differentiate between the two. Therefore, the output may use one method + where the original card used the other. When rendered, the two results + should, however, be the same. + - Table alignment may be different. At the moment, skops does not make use + of column alignment information in tables, so that may differ. + - Quote symbols may differ, e.g. ``it’s`` becoming ``it's``. + - The number of empty lines may differ, e.g. two empty lines being + transformed into one empty line. + - Trailing whitespace is removed. + - Tab indentation may be removed, e.g. in raw html. + - The yaml part of the model card can have some non-semantic differences, + like omitting optional quotation marks. + + For these reasons, please don't expect the output of a parsed card to be + 100% identical to the original input. However, none of the listed changes + makes any _semantic_ difference. If you find that there is a semantic + difference in the output, please open an issue on GitHub. 
+ Parameters ---------- path : str or pathlib.Path diff --git a/skops/card/tests/examples/bert-base-uncased.md.diff b/skops/card/tests/examples/bert-base-uncased.md.diff new file mode 100644 index 00000000..e4fb5c66 --- /dev/null +++ b/skops/card/tests/examples/bert-base-uncased.md.diff @@ -0,0 +1,40 @@ +--- ++++ +@@ -52,10 +52,10 @@ +-| Model | #params | Language | +-|------------------------|--------------------------------|-------| +-| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | +-| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub +-| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | +-| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | +-| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | +-| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | +-| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | +-| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | ++| Model | #params | Language | ++|---------------------------------------------------------------------------------------------------------|-----------|------------| ++| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | ++| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | ++| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | ++| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | ++| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | ++| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | ++| 
[`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | ++| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | +@@ -65 +65 @@ +-You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to ++You can use the raw model for either masked language modeling or next sentence prediction, but it’s mostly intended to +@@ -197 +197 @@ +-the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a ++the other cases, it’s another random sentence in the corpus. Note that what is considered a sentence here is a +@@ -220,4 +220,3 @@ +-| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +-|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| +-| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | +- ++| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | ++|--------|---------------|-------|--------|---------|--------|---------|--------|-------|-----------| ++| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | +@@ -248 +247 @@ +- ++ diff --git a/skops/card/tests/examples/clip-vit-large-patch14.md b/skops/card/tests/examples/clip-vit-large-patch14.md new file mode 100644 index 00000000..cbbfa909 --- /dev/null +++ b/skops/card/tests/examples/clip-vit-large-patch14.md @@ -0,0 +1,147 @@ +--- +tags: +- vision +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png + candidate_labels: playing music, playing sports + example_title: Cat & Dog +--- + +# Model Card: CLIP + + + +Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). 
+ +## Model Details + +The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. + +### Model Date + +January 2021 + +### Model Type + +The base model uses a ViT-L/14 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. + +The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. + + +### Documents + +- [Blog Post](https://openai.com/blog/clip/) +- [CLIP Paper](https://arxiv.org/abs/2103.00020) + + +### Use with Transformers + +```python +from PIL import Image +import requests + +from transformers import CLIPProcessor, CLIPModel + +model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") +processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +outputs = model(**inputs) +logits_per_image = outputs.logits_per_image # this is the image-text similarity score +probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities +``` + + +## Model Use + +### Intended Use + +The model is intended as a research output for research communities. 
We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. + +#### Primary intended uses + +The primary intended users of these models are AI researchers. + +We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. + +### Out-of-Scope Use Cases + +**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. + +Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. + +Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. + + + +## Data + +The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). 
A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. + +### Data Mission Statement + +Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. + + + +## Performance and Limitations + +### Performance + +We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: + +- Food101 +- CIFAR10 +- CIFAR100 +- Birdsnap +- SUN397 +- Stanford Cars +- FGVC Aircraft +- VOC2007 +- DTD +- Oxford-IIIT Pet dataset +- Caltech101 +- Flowers102 +- MNIST +- SVHN +- IIIT5K +- Hateful Memes +- SST-2 +- UCF101 +- Kinetics700 +- Country211 +- CLEVR Counting +- KITTI Distance +- STL-10 +- RareAct +- Flickr30 +- MSCOCO +- ImageNet +- ImageNet-A +- ImageNet-R +- ImageNet Sketch +- ObjectNet (ImageNet Overlap) +- Youtube-BB +- ImageNet-Vid + +## Limitations + +CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. 
Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. + +### Bias and Fairness + +We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). + +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 
+ + + +## Feedback + +### Where to send questions or comments about the model + +Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) diff --git a/skops/card/tests/examples/clip-vit-large-patch14.md.diff b/skops/card/tests/examples/clip-vit-large-patch14.md.diff new file mode 100644 index 00000000..f2da254b --- /dev/null +++ b/skops/card/tests/examples/clip-vit-large-patch14.md.diff @@ -0,0 +1,19 @@ +--- ++++ +@@ -30 +29,0 @@ +- +@@ -35 +33,0 @@ +- +@@ -58 +55,0 @@ +- +@@ -79,2 +75,0 @@ +- +- +@@ -88,2 +82,0 @@ +- +- +@@ -139,3 +132 @@ +-We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. +- +- ++We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with 'Middle Eastern' having the highest accuracy (98.4%) and 'White' having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. 
Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. diff --git a/skops/card/tests/examples/gpt2.md b/skops/card/tests/examples/gpt2.md new file mode 100644 index 00000000..6481d600 --- /dev/null +++ b/skops/card/tests/examples/gpt2.md @@ -0,0 +1,168 @@ +--- +language: en +tags: +- exbert + +license: mit +--- + +# GPT-2 + + + +Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large + +Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in +[this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) +and first released at [this page](https://openai.com/blog/better-language-models/). + +Disclaimer: The team releasing GPT-2 also wrote a +[model card](https://github.com/openai/gpt-2/blob/master/model_card.md) for their model. Content from this model card +has been written by the Hugging Face team to complete the information they provided and give specific examples of bias. + +## Model description + +GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This +means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots +of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, +it was trained to guess the next word in sentences. + +More precisely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, +shifted one token (word or piece of word) to the right. 
The model uses internally a mask-mechanism to make sure the +predictions for the token `i` only uses the inputs from `1` to `i` but not the future tokens. + +This way, the model learns an inner representation of the English language that can then be used to extract features +useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a +prompt. + +This is the **smallest** version of GPT-2, with 124M parameters. + +**Related Models:** [GPT-Large](https://huggingface.co/gpt2-large), [GPT-Medium](https://huggingface.co/gpt2-medium) and [GPT-XL](https://huggingface.co/gpt2-xl) + +## Intended uses & limitations + +You can use the raw model for text generation or fine-tune it to a downstream task. See the +[model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you. + +### How to use + +You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness, we +set a seed for reproducibility: + +```python +>>> from transformers import pipeline, set_seed +>>> generator = pipeline('text-generation', model='gpt2') +>>> set_seed(42) +>>> generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5) + +[{'generated_text': "Hello, I'm a language model, a language for thinking, a language for expressing thoughts."}, + {'generated_text': "Hello, I'm a language model, a compiler, a compiler library, I just want to know how I build this kind of stuff. I don"}, + {'generated_text': "Hello, I'm a language model, and also have more than a few of your own, but I understand that they're going to need some help"}, + {'generated_text': "Hello, I'm a language model, a system model. 
I want to know my language so that it might be more interesting, more user-friendly"}, + {'generated_text': 'Hello, I\'m a language model, not a language model"\n\nThe concept of "no-tricks" comes in handy later with new'}] +``` + +Here is how to use this model to get the features of a given text in PyTorch: + +```python +from transformers import GPT2Tokenizer, GPT2Model +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') +model = GPT2Model.from_pretrained('gpt2') +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='pt') +output = model(**encoded_input) +``` + +and in TensorFlow: + +```python +from transformers import GPT2Tokenizer, TFGPT2Model +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') +model = TFGPT2Model.from_pretrained('gpt2') +text = "Replace me by any text you'd like." +encoded_input = tokenizer(text, return_tensors='tf') +output = model(encoded_input) +``` + +### Limitations and bias + +The training data used for this model has not been released as a dataset one can browse. We know it contains a lot of +unfiltered content from the internet, which is far from neutral. As the openAI team themselves point out in their +[model card](https://github.com/openai/gpt-2/blob/master/model_card.md#out-of-scope-use-cases): + +> Because large-scale language models like GPT-2 do not distinguish fact from fiction, we don’t support use-cases +> that require the generated text to be true. +> +> Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so we do +> not recommend that they be deployed into systems that interact with humans > unless the deployers first carry out a +> study of biases relevant to the intended use-case. 
We found no statistically significant difference in gender, race, +> and religious bias probes between 774M and 1.5B, implying all versions of GPT-2 should be approached with similar +> levels of caution around use cases that are sensitive to biases around human attributes. + +Here's an example of how the model can have biased predictions: + +```python +>>> from transformers import pipeline, set_seed +>>> generator = pipeline('text-generation', model='gpt2') +>>> set_seed(42) +>>> generator("The White man worked as a", max_length=10, num_return_sequences=5) + +[{'generated_text': 'The White man worked as a mannequin for'}, + {'generated_text': 'The White man worked as a maniser of the'}, + {'generated_text': 'The White man worked as a bus conductor by day'}, + {'generated_text': 'The White man worked as a plumber at the'}, + {'generated_text': 'The White man worked as a journalist. He had'}] + +>>> set_seed(42) +>>> generator("The Black man worked as a", max_length=10, num_return_sequences=5) + +[{'generated_text': 'The Black man worked as a man at a restaurant'}, + {'generated_text': 'The Black man worked as a car salesman in a'}, + {'generated_text': 'The Black man worked as a police sergeant at the'}, + {'generated_text': 'The Black man worked as a man-eating monster'}, + {'generated_text': 'The Black man worked as a slave, and was'}] +``` + +This bias will also affect all fine-tuned versions of this model. + +## Training data + +The OpenAI team wanted to train this model on a corpus as large as possible. To build it, they scraped all the web +pages from outbound links on Reddit which received at least 3 karma. Note that all Wikipedia pages were removed from +this dataset, so the model was not trained on any part of Wikipedia. The resulting dataset (called WebText) weights +40GB of texts but has not been publicly released. You can find a list of the top 1,000 domains present in WebText +[here](https://github.com/openai/gpt-2/blob/master/domains.txt). 
+ +## Training procedure + +### Preprocessing + +The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a +vocabulary size of 50,257. The inputs are sequences of 1024 consecutive tokens. + +The larger model was trained on 256 cloud TPU v3 cores. The training duration was not disclosed, nor were the exact +details of training. + +## Evaluation results + +The model achieves the following results without any fine-tuning (zero-shot): + +| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | +|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| +| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | +| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | + + +### BibTeX entry and citation info + +```bibtex +@article{radford2019language, + title={Language Models are Unsupervised Multitask Learners}, + author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, + year={2019} +} +``` + + + + diff --git a/skops/card/tests/examples/gpt2.md.diff b/skops/card/tests/examples/gpt2.md.diff new file mode 100644 index 00000000..e95bc0cb --- /dev/null +++ b/skops/card/tests/examples/gpt2.md.diff @@ -0,0 +1,22 @@ +--- ++++ +@@ -5 +4,0 @@ +- +@@ -95 +93,0 @@ +-> +@@ -102 +100 @@ +-Here's an example of how the model can have biased predictions: ++Here’s an example of how the model can have biased predictions: +@@ -150,5 +148,4 @@ +-| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | +-|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| +-| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | +-| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 
75.20 | +- ++| Dataset | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | ++|-----------|-----------|----------|----------|-------------|-------|-----------|---------|---------------|-------| ++| (metric) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | ++| | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | +@@ -167 +164 @@ +- ++ diff --git a/skops/card/tests/examples/specter.md b/skops/card/tests/examples/specter.md new file mode 100644 index 00000000..e53e77ce --- /dev/null +++ b/skops/card/tests/examples/specter.md @@ -0,0 +1,26 @@ +--- +language: en +thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" +license: apache-2.0 +datasets: +- SciDocs +metrics: +- F1 +- accuracy +- map +- ndcg +--- + +## SPECTER + + + +SPECTER is a pre-trained language model to generate document-level embedding of documents. It is pre-trained on a a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. + +Paper: [SPECTER: Document-level Representation Learning using Citation-informed Transformers](https://arxiv.org/pdf/2004.07180.pdf) + +Original Repo: [Github](https://github.com/allenai/specter) + +Evaluation Benchmark: [SciDocs](https://github.com/allenai/scidocs) + +Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. 
Weld* diff --git a/skops/card/tests/examples/specter.md.diff b/skops/card/tests/examples/specter.md.diff new file mode 100644 index 00000000..7fcfa951 --- /dev/null +++ b/skops/card/tests/examples/specter.md.diff @@ -0,0 +1,11 @@ +--- ++++ +@@ -3 +3 @@ +-thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" ++thumbnail: https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67 +@@ -14 +14 @@ +-## SPECTER ++# SPECTER +@@ -26 +26 @@ +-Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld* ++Authors: _Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld_ diff --git a/skops/card/tests/examples/vit-base-patch32-224-in21k.md b/skops/card/tests/examples/vit-base-patch32-224-in21k.md new file mode 100644 index 00000000..570f5916 --- /dev/null +++ b/skops/card/tests/examples/vit-base-patch32-224-in21k.md @@ -0,0 +1,94 @@ +--- +license: apache-2.0 +tags: +- vision +datasets: +- imagenet-21k +inference: false +--- + +# Vision Transformer (base-sized model) + + + +Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224. It was introduced in the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Dosovitskiy et al. and first released in [this repository](https://github.com/google-research/vision_transformer). However, the weights were converted from the [timm repository](https://github.com/rwightman/pytorch-image-models) by Ross Wightman, who already converted the weights from JAX to PyTorch. Credits go to him. 
+ +Disclaimer: The team releasing ViT did not write a model card for this model so this model card has been written by the Hugging Face team. + +## Model description + +The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels. + +Images are presented to the model as a sequence of fixed-size patches (resolution 32x32), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. + +Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). + +By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. + +## Intended uses & limitations + +You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=google/vit) to look for +fine-tuned versions on a task that interests you. 
+ +### How to use + +Here is how to use this model: + +```python +from transformers import ViTFeatureExtractor, ViTModel +from PIL import Image +import requests +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch32-224-in21k') +model = ViTModel.from_pretrained('google/vit-base-patch32-224-in21k') +inputs = feature_extractor(images=image, return_tensors="pt") +outputs = model(**inputs) +last_hidden_state = outputs.last_hidden_state +``` + +Currently, both the feature extractor and model support PyTorch. Tensorflow and JAX/FLAX are coming soon, and the API of ViTFeatureExtractor might change. + +## Training data + +The ViT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes. + +## Training procedure + +### Preprocessing + +The exact details of preprocessing of images during training/validation can be found [here](https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py). + +Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5). + +### Pretraining + +The model was trained on TPUv3 hardware (8 cores). All model variants are trained with a batch size of 4096 and learning rate warmup of 10k steps. For ImageNet, the authors found it beneficial to additionally apply gradient clipping at global norm 1. Pre-training resolution is 224. + +## Evaluation results + +For evaluation results on several image classification benchmarks, we refer to tables 2 and 5 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution (384x384). Of course, increasing the model size will result in better performance. 
+ +### BibTeX entry and citation info + +```bibtex +@misc{wu2020visual, + title={Visual Transformers: Token-based Image Representation and Processing for Computer Vision}, + author={Bichen Wu and Chenfeng Xu and Xiaoliang Dai and Alvin Wan and Peizhao Zhang and Zhicheng Yan and Masayoshi Tomizuka and Joseph Gonzalez and Kurt Keutzer and Peter Vajda}, + year={2020}, + eprint={2006.03677}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +```bibtex +@inproceedings{deng2009imagenet, + title={Imagenet: A large-scale hierarchical image database}, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + booktitle={2009 IEEE conference on computer vision and pattern recognition}, + pages={248--255}, + year={2009}, + organization={Ieee} +} +``` diff --git a/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff new file mode 100644 index 00000000..cf849c95 --- /dev/null +++ b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff @@ -0,0 +1,5 @@ +--- ++++ +@@ -24 +24 @@ +-Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). ++Note that this model does not provide any fine-tuned heads, as these were zero’d by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 
diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index 87d54d84..c96717ec 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -1,3 +1,4 @@ +import difflib import os from pathlib import Path @@ -41,55 +42,60 @@ def card(fit_model, tmp_path): return card -def assert_readme_files_equal(file0, file1): - """Check that the two model cards are identical, but allow differences in - line breaks.""" - # exclude trivial case of both being empty - assert file0 - assert file1 +EXAMPLE_CARDS = [ + "bert-base-uncased.md", + "clip-vit-large-patch14.md", + "gpt2.md", + "specter.md", + "vit-base-patch32-224-in21k.md", +] + +def assert_readme_files_almost_equal(file0, file1, diff): + """Check that the two model cards are identical, but allow differences as + defined in the ``diff`` file""" with open(file0, "r") as f: readme0 = f.readlines() with open(file1, "r") as f: readme1 = f.readlines() - # remove completely empty lines - readme0 = [line.strip() for line in readme0 if line.strip()] - readme1 = [line.strip() for line in readme1 if line.strip()] + # exclude trivial case of both being empty + assert readme0 + assert readme1 - readme_str0 = "\n".join(readme0) - readme_str1 = "\n".join(readme1) + diff_actual = list(difflib.unified_diff(readme0, readme1, n=0)) - # a minuscule further difference is an excess empty line after - readme_str1 = readme_str1.replace("\n", "") + with open(diff, "r") as f: + diff_expected = f.readlines() - assert readme_str0 == readme_str1 + assert diff_actual == diff_expected -def test_parsed_card_identical(card, tmp_path): - file0 = tmp_path / "readme-skops.md" - card.save(file0) +@pytest.mark.parametrize("file_name", EXAMPLE_CARDS, ids=EXAMPLE_CARDS) +def test_example_model_cards(tmp_path, file_name): + """Test that the difference between original and parsed model card is + acceptable - parsed_card = parse_modelcard(file0) - file1 = tmp_path / "readme-parsed.md" - parsed_card.save(file1) + For 
this test, model cards for some of the most popular models on HF Hub + were retrieved and stored in the ./examples folder. This test checks that + these model cards can be successfully parsed and that the output is *almost* + the same. + + We don't expect the output to be 100% identical, see the limitations listed + in ``parse_modelcard``. Instead, we assert that the diff corresponds to the + expected diff, which is also checked in. - assert_readme_files_equal(file0, file1) + So e.g. for "specter.md", we expect that the diff will be the same diff as + in "specter.md.diff". + """ + path = Path(os.getcwd()) / "skops" / "card" / "tests" / "examples" + file0 = path / file_name + diff = (path / file_name).with_suffix(".md.diff") -@pytest.mark.xfail(reason="small diff, especially in tables") -def test_bert_base_uncased(tmp_path): - file0 = ( - Path(os.getcwd()) - / "skops" - / "card" - / "tests" - / "examples" - / "bert-base-uncased.md" - ) parsed_card = parse_modelcard(file0) file1 = tmp_path / "readme-parsed.md" parsed_card.save(file1) - assert_readme_files_equal(file0, file1) + assert_readme_files_almost_equal(file0, file1, diff) From 5e5c1a35324b3fb393751cad8331ef4ad45dabcf Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 6 Dec 2022 11:10:40 +0100 Subject: [PATCH 23/47] Rename Markdown class's methods, make private --- skops/card/_markup.py | 96 +++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 9ddf1afd..e330a529 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -17,8 +17,8 @@ class Markdown: This class has a ``mapping`` attribute, which is just a dict. The keys are Pandoc types and the values are functions that transform the corresponding value into a string with markdown syntax. Those functions are all prefixed - with ``md_``, e.g. ``md_image`` for transforming a pandoc ``Image`` into a - markdown figure. + with ``_``, e.g. 
``_image`` for transforming a pandoc ``Image`` into a + markdown figure, or ``_raw_block``, to transform a pandoc ``RawBlock``. From the caller side, only the ``__call__`` method should be used, the rest should be considered internals. @@ -28,91 +28,91 @@ class Markdown: def __init__(self): # markdown syntax dispatch table self.mapping = { - "Space": self.md_space, - "Strong": self.md_strong, - "Emph": self.md_emph, - "Strikeout": self.md_strikeout, - "Subscript": self.md_subscript, - "Superscript": self.md_superscript, - "Plain": self.md_plain, - "Str": self.md_str, - "RawInline": self.md_rawline, - "RawBlock": self.md_raw_block, - "SoftBreak": self.md_softbreak, - "LineBreak": self.md_linebreak, - "Para": self.md_para, - "Header": self.md_header, - "Image": self.md_image, - "CodeBlock": self.md_code_block, - "Code": self.md_code, - "Table": self.md_table, - "Div": self.md_parse_div, - "Link": self.md_link, - "BulletList": self.md_bullet_list, - "Quoted": self.md_quoted, - "BlockQuote": self.md_block_quote, + "Space": self._space, + "Strong": self._strong, + "Emph": self._emph, + "Strikeout": self._strikeout, + "Subscript": self._subscript, + "Superscript": self._superscript, + "Plain": self._plain, + "Str": self._str, + "RawInline": self._raw_inline, + "RawBlock": self._raw_block, + "SoftBreak": self._soft_break, + "LineBreak": self._line_break, + "Para": self._para, + "Header": self._header, + "Image": self._image, + "CodeBlock": self._code_block, + "Code": self._code, + "Table": self._table, + "Div": self._parse_div, + "Link": self._link, + "BulletList": self._bullet_list, + "Quoted": self._quoted, + "BlockQuote": self._block_quote, } @staticmethod - def md_space(value) -> str: + def _space(value) -> str: return " " - def md_strong(self, value) -> str: + def _strong(self, value) -> str: parts = ["**"] parts += [self.__call__(subitem) for subitem in value] parts.append("**") return "".join(parts) - def md_emph(self, value) -> str: + def _emph(self, value) -> 
str: parts = ["_"] parts += [self.__call__(subitem) for subitem in value] parts.append("_") return "".join(parts) - def md_strikeout(self, value) -> str: + def _strikeout(self, value) -> str: parts = ["~~"] parts += [self.__call__(subitem) for subitem in value] parts.append("~~") return "".join(parts) - def md_subscript(self, value) -> str: + def _subscript(self, value) -> str: parts = [""] parts += [self.__call__(subitem) for subitem in value] parts.append("") return "".join(parts) - def md_superscript(self, value) -> str: + def _superscript(self, value) -> str: parts = [""] parts += [self.__call__(subitem) for subitem in value] parts.append("") return "".join(parts) - def md_plain(self, value) -> str: + def _plain(self, value) -> str: parts = [self.__call__(subitem) for subitem in value] return "".join(parts) @staticmethod - def md_str(value) -> str: + def _str(value) -> str: # escape \ return value.replace("\\", "\\\\") @staticmethod - def md_rawline(value) -> str: + def _raw_inline(value) -> str: _, line = value return line - def md_raw_block(self, item) -> str: + def _raw_block(self, item) -> str: # throw away the first item, which is just something like 'html' # might have to revisit this if output != markdown _, line = item return line @staticmethod - def md_softbreak(value) -> str: + def _soft_break(value) -> str: return "\n" @staticmethod - def md_linebreak(value) -> str: + def _line_break(value) -> str: return "\n" def _make_content(self, content): @@ -122,16 +122,16 @@ def _make_content(self, content): parts.append(part) return "".join(parts) - def md_para(self, value: list[dict[str, str]]) -> str: + def _para(self, value: list[dict[str, str]]) -> str: content = self._make_content(value) return content - def md_header(self, value: tuple[int, Any, list[dict[str, str]]]) -> str: + def _header(self, value: tuple[int, Any, list[dict[str, str]]]) -> str: level, _, content_parts = value section_name = self._make_content(content_parts) return section_name - def 
md_image(self, value) -> str: + def _image(self, value) -> str: (ident, _, keyvals), caption, (dest, typef) = value # it seems like ident and keyvals are not relevant for markdown assert caption @@ -142,7 +142,7 @@ def md_image(self, value) -> str: return content @staticmethod - def md_code_block(item: tuple[tuple[int, list[str], list[str]], str]) -> str: + def _code_block(item: tuple[tuple[int, list[str], list[str]], str]) -> str: # a codeblock consists of: (id, classes, namevals) contents (_, classes, _), content = item block_start = "```" @@ -153,11 +153,11 @@ def md_code_block(item: tuple[tuple[int, list[str], list[str]], str]) -> str: return content @staticmethod - def md_code(item: tuple[Any, str]) -> str: + def _code(item: tuple[Any, str]) -> str: _, txt = item return f"`{txt}`" - def md_table(self, item) -> str: + def _table(self, item) -> str: _, alignments, _, header, rows = item fn = self.__call__ columns = ["".join(fn(part) for part in col) for col in header] @@ -178,7 +178,7 @@ def md_table(self, item) -> str: res = TableSection(table).format() return res - def md_parse_div(self, item) -> str: + def _parse_div(self, item) -> str: # note that in markdown, we basically just use the raw html (ident, classes, kvs), contents = item @@ -206,12 +206,12 @@ def md_parse_div(self, item) -> str: end = "" return "".join([start] + middle + [end]) - def md_link(self, item) -> str: + def _link(self, item) -> str: _, txt, (src, _) = item txt_formatted = self._make_content(txt) return f"[{txt_formatted}]({src})" - def md_bullet_list(self, item) -> str: + def _bullet_list(self, item) -> str: parts = [] for subitem in item: assert len(subitem) == 1 @@ -221,7 +221,7 @@ def md_bullet_list(self, item) -> str: parts.append(f"- {content}") return "\n".join(parts) - def md_quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: + def _quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: quote_type, content = item type_ = quote_type["t"] try: @@ 
-236,7 +236,7 @@ def md_quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: text = "".join(self.__call__(i) for i in content) return f"{sym}{text}{sym}" - def md_block_quote(self, item: list[PandocItem]) -> str: + def _block_quote(self, item: list[PandocItem]) -> str: parts = [] for subitem in item: content = self.__call__(subitem) From 897975125782b8cf24cace741686bbd70706a29b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 15:50:10 +0100 Subject: [PATCH 24/47] Add version check for pandoc --- skops/card/_parser.py | 70 ++++++++++++++++++++++++++++++--- skops/card/tests/test_parser.py | 2 +- 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 933ea1d8..48037c5b 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -12,7 +12,7 @@ import subprocess from pathlib import Path from tempfile import mkdtemp -from typing import Any +from typing import Any, Sequence import yaml # type: ignore @@ -21,6 +21,8 @@ from ._markup import Markdown, PandocItem +PANDOC_MIN_VERSION = (2, 19, 0) + class PandocParser: """TODO""" @@ -95,20 +97,71 @@ def generate(self) -> Card: return self.card -def check_pandoc_installed() -> None: +def _get_pandoc_version() -> list[int]: + """Shell out to retrieve the pandoc version + + Raises + ------ + RuntimeError + If pandoc version could not be determined, raise a ``RuntimeError``. + + Returns + ------- + pandoc_version : list[int] + The pandoc version as a list of ints. 
+ """ + proc = subprocess.run( + ["pandoc", "--version"], + capture_output=True, + ) + version_info = str(proc.stdout.decode("utf-8")).split("\n", 1)[0] + if not version_info.startswith("pandoc "): + raise RuntimeError("Could not determine version of pandoc") + + _, _, actual_version = version_info.partition(" ") + pandoc_version = [int(v) for v in actual_version.split(".")] + return pandoc_version + + +def _check_version_greater_equal( + version: Sequence[int], min_version: Sequence[int] +) -> None: + """Very bad version comparison function to ensure that the first version is + >= the second.""" + for v1, v2 in zip(version, min_version): + if v1 > v2: + return + + if v1 < v2: + raise ValueError( + "Pandoc version too low, expected at least " + f"{'.'.join(map(str, min_version))}" + ) + + +def check_pandoc_installed( + min_version: Sequence[int] | None = PANDOC_MIN_VERSION, +) -> None: """Check if pandoc is installed on the system + Parameters + ---------- + min_version : list[int] or None + If passed, check that the pandoc version is greater or equal to this one. + Raises ------ FileNotFoundError When the binary is not found, raise this error. + RuntimeError + If pandoc version could not be determined, raise a ``RuntimeError``. + + ValueError + If min version is passed and not matched or exceeded, raise a ``ValueError``. 
""" try: - subprocess.run( - ["pandoc", "--version"], - capture_output=True, - ) + pandoc_version = _get_pandoc_version() except FileNotFoundError as exc: msg = ( "This feature requires the pandoc library to be installed on your system, " @@ -117,6 +170,11 @@ def check_pandoc_installed() -> None: ) raise FileNotFoundError(msg) from exc + if not min_version: + return + + _check_version_greater_equal(pandoc_version, min_version) + def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]: """Detach the possibly existing yaml part of the model card diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index c96717ec..c904b1ff 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -13,7 +13,7 @@ check_pandoc_installed() except FileNotFoundError: # not installed, skip - pytest.skip(reason="These tests require pandoc", allow_module_level=True) + pytest.skip(reason="These tests require a recent pandoc", allow_module_level=True) @pytest.fixture From e8d6f6174964fca0a5084d1d9fb63a0e6bb02955 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 15:53:36 +0100 Subject: [PATCH 25/47] New table parsing code There was a change in pandoc, this now works with the latest pandoc version. 
--- skops/card/_markup.py | 45 +++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index e330a529..63128485 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -157,22 +157,47 @@ def _code(item: tuple[Any, str]) -> str: _, txt = item return f"`{txt}`" - def _table(self, item) -> str: - _, alignments, _, header, rows = item + def _table_cols(self, items) -> list[str]: + columns = [] + fn = self.__call__ + for item in items: + _, alignment, _, _, content = item + column = "".join(fn(part) for part in content) + columns.append(column) + return columns + + def _table_body(self, items) -> list[list[str]]: + body = [] fn = self.__call__ - columns = ["".join(fn(part) for part in col) for col in header] - if not columns: - raise ValueError("Table with no columns...") + for _, row_items in items: + row = [] + for col_row_item in row_items: + _, alignment, _, _, content = col_row_item + row.append("".join(fn(part) for part in content)) + body.append(row) + return body + + def _table(self, item) -> str: + # attr capt specs thead tbody tfoot + _, _, _, thead, tbody, _ = item + + # header + (_, thead_bodies) = thead + (attr, thead_body) = thead_bodies[0] # multiple headers? + + columns = self._table_cols(thead_body) - data = [] # row oriented - for row in rows: - data.append(["".join(fn(part) for part in col) for col in row]) + # rows + # attr rhc hd bd + _, _, _, trows = tbody[0] # multiple groups of rows? 
+ body = self._table_body(trows) table: Mapping[str, Sequence[Any]] - if not data: + if not body: table = {key: [] for key in columns} else: - data_transposed = zip(*data) # column oriented + # body is row oriented, transpose to column oriented + data_transposed = zip(*body) table = {key: val for key, val in zip(columns, data_transposed)} res = TableSection(table).format() From d0e07bce2d36a6e66fa77fdef670a40d93ff84a5 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 17:01:26 +0100 Subject: [PATCH 26/47] Fix "bug" with metainfo order For some reason, the order of items in the metainfo is no longer stable. Therefore, the tests comparing the parsed card vs original card failed. Now metainfo is excluded when comparing the cards. The metainfo is now checked separately, in a way that disregards the order. --- .../tests/examples/bert-base-uncased.md.diff | 10 +++++----- .../examples/clip-vit-large-patch14.md.diff | 12 +++++------ skops/card/tests/examples/gpt2.md.diff | 10 ++++------ skops/card/tests/examples/specter.md.diff | 5 +---- .../vit-base-patch32-224-in21k.md.diff | 2 +- skops/card/tests/test_parser.py | 20 ++++++++++++++++++- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/skops/card/tests/examples/bert-base-uncased.md.diff b/skops/card/tests/examples/bert-base-uncased.md.diff index e4fb5c66..2367a8d8 100644 --- a/skops/card/tests/examples/bert-base-uncased.md.diff +++ b/skops/card/tests/examples/bert-base-uncased.md.diff @@ -1,6 +1,6 @@ --- +++ -@@ -52,10 +52,10 @@ +@@ -44,10 +44,10 @@ -| Model | #params | Language | -|------------------------|--------------------------------|-------| -| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | @@ -21,13 +21,13 @@ +| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | +| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | +| 
[`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | -@@ -65 +65 @@ +@@ -57 +57 @@ -You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to +You can use the raw model for either masked language modeling or next sentence prediction, but it’s mostly intended to -@@ -197 +197 @@ +@@ -189 +189 @@ -the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a +the other cases, it’s another random sentence in the corpus. Note that what is considered a sentence here is a -@@ -220,4 +220,3 @@ +@@ -212,4 +212,3 @@ -| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | -|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| -| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | @@ -35,6 +35,6 @@ +| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +|--------|---------------|-------|--------|---------|--------|---------|--------|-------|-----------| +| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | -@@ -248 +247 @@ +@@ -240 +239 @@ - + diff --git a/skops/card/tests/examples/clip-vit-large-patch14.md.diff b/skops/card/tests/examples/clip-vit-large-patch14.md.diff index f2da254b..8516f684 100644 --- a/skops/card/tests/examples/clip-vit-large-patch14.md.diff +++ b/skops/card/tests/examples/clip-vit-large-patch14.md.diff @@ -1,18 +1,18 @@ --- +++ -@@ -30 +29,0 @@ +@@ -23 +22,0 @@ - -@@ -35 +33,0 @@ +@@ -28 +26,0 @@ - -@@ -58 +55,0 @@ +@@ -51 +48,0 @@ - -@@ -79,2 +75,0 @@ +@@ -72,2 +68,0 @@ - - -@@ -88,2 +82,0 @@ +@@ -81,2 +75,0 @@ - - -@@ -139,3 +132 @@ +@@ -132,3 +125 @@ -We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) 
in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. - - diff --git a/skops/card/tests/examples/gpt2.md.diff b/skops/card/tests/examples/gpt2.md.diff index e95bc0cb..ee0d38f1 100644 --- a/skops/card/tests/examples/gpt2.md.diff +++ b/skops/card/tests/examples/gpt2.md.diff @@ -1,13 +1,11 @@ --- +++ -@@ -5 +4,0 @@ -- -@@ -95 +93,0 @@ +@@ -89 +88,0 @@ -> -@@ -102 +100 @@ +@@ -96 +95 @@ -Here's an example of how the model can have biased predictions: +Here’s an example of how the model can have biased predictions: -@@ -150,5 +148,4 @@ +@@ -144,5 +143,4 @@ -| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | -|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| -| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | @@ -17,6 +15,6 @@ +|-----------|-----------|----------|----------|-------------|-------|-----------|---------|---------------|-------| +| (metric) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | +| | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | -@@ -167 +164 @@ +@@ -161 +159 @@ - + diff --git a/skops/card/tests/examples/specter.md.diff b/skops/card/tests/examples/specter.md.diff index 7fcfa951..647c85a8 100644 --- a/skops/card/tests/examples/specter.md.diff +++ b/skops/card/tests/examples/specter.md.diff @@ -1,11 +1,8 @@ --- +++ @@ -3 +3 @@ 
--thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" -+thumbnail: https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67 -@@ -14 +14 @@ -## SPECTER +# SPECTER -@@ -26 +26 @@ +@@ -15 +15 @@ -Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld* +Authors: _Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld_ diff --git a/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff index cf849c95..b48c0b73 100644 --- a/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff +++ b/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff @@ -1,5 +1,5 @@ --- +++ -@@ -24 +24 @@ +@@ -17 +17 @@ -Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). +Note that this model does not provide any fine-tuned heads, as these were zero’d by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 
diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index c904b1ff..b723c03e 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import yaml # type: ignore from sklearn.linear_model import LinearRegression from skops.card import Card, parse_modelcard @@ -51,15 +52,32 @@ def card(fit_model, tmp_path): ] +def _assert_meta_equal(meta0, meta1): + # we cannot guarantee the order of metadata items, so we compare parsed + # dicts, but not strings directly + assert yaml.safe_load("".join(meta0)) == yaml.safe_load("".join(meta1)) + + def assert_readme_files_almost_equal(file0, file1, diff): """Check that the two model cards are identical, but allow differences as - defined in the ``diff`` file""" + defined in the ``diff`` file + + The metainfo is compared separately, as the order of the items is not + guaranteed to be stable. + """ with open(file0, "r") as f: readme0 = f.readlines() with open(file1, "r") as f: readme1 = f.readlines() + sep = "---\n" + idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1 + meta0, meta1 = readme0[1:idx0], readme1[1:idx1] + readme0, readme1 = readme0[idx0:], readme1[idx1:] + + _assert_meta_equal(meta0, meta1) + # exclude trivial case of both being empty assert readme0 assert readme1 From 5a9f8261368215b74ebac2d9fc2bf773bb3164a6 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 17:10:15 +0100 Subject: [PATCH 27/47] Clean up noise created from merge conflict --- docs/model_card.rst | 4 ---- skops/card/_model_card.py | 1 - 2 files changed, 5 deletions(-) diff --git a/docs/model_card.rst b/docs/model_card.rst index f6b0d37a..7a6c124d 100644 --- a/docs/model_card.rst +++ b/docs/model_card.rst @@ -103,7 +103,3 @@ using :meth:`Card.select`, and you can delete sections using To see how you can use the API in ``skops`` to create a model card, please refer to :ref:`sphx_glr_auto_examples_plot_model_card.py`. 
- -Templates ---------- -TODO diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index dadeb0f6..2adafa62 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -728,7 +728,6 @@ def add_model_plot( ------- self : object Card object. - """ if not self.model_diagram: return self From 5bc590972def0438eed34b5ef9f8f6580617e920 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 17:25:35 +0100 Subject: [PATCH 28/47] Fix TypedDict import for Py<3.9, doctest --- skops/card/_markup.py | 8 +++++++- skops/card/_parser.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 63128485..de657d83 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -1,10 +1,16 @@ """Classes for translating into the syntax of different markup languages""" +import sys from collections.abc import Mapping -from typing import Any, Sequence, TypedDict +from typing import Any, Sequence from skops.card._model_card import TableSection +if sys.version_info.minor >= 9: + from typing import TypedDict +else: + from typing_extensions import TypedDict + class PandocItem(TypedDict): t: str diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 48037c5b..f6e22aea 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -239,7 +239,7 @@ def parse_modelcard(path: str | Path) -> Card: >>> import numpy as np >>> from sklearn.linear_model import LinearRegression >>> from skops.card import Card - >>> from skops.card import parse_card + >>> from skops.card import parse_modelcard >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) >>> y = np.dot(X, np.array([1, 2])) + 3 >>> regr = LinearRegression().fit(X, y) @@ -249,6 +249,7 @@ def parse_modelcard(path: str | Path) -> Card: >>> parsed_card = parse_modelcard("README.md") >>> # continue editing the card >>> parsed_card.add(**{"My new section": "My new content"}) + Card(...) 
>>> # overwrite old card with new one >>> parsed_card.save("README.md") From 3c3ff89a24f67829e860ac107136b82450d36970 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 16 Dec 2022 17:29:27 +0100 Subject: [PATCH 29/47] Add future annotations import to _markup.py --- skops/card/_markup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index de657d83..531177db 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -1,5 +1,7 @@ """Classes for translating into the syntax of different markup languages""" +from __future__ import annotations + import sys from collections.abc import Mapping from typing import Any, Sequence From c2ccfcc4a199d09b275322e14a165bb76c16ac9c Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 19 Dec 2022 13:00:43 +0100 Subject: [PATCH 30/47] Add more documentation, refactor parser class The parser is now less stateful. --- skops/card/_model_card.py | 5 ++ skops/card/_parser.py | 115 +++++++++++++++++++++++++------------- 2 files changed, 82 insertions(+), 38 deletions(-) diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 2adafa62..ed7bedf1 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -683,6 +683,11 @@ def _add_single(self, key: str, val: Formattable | str) -> Section: val: str or Formattable The value to assign to the (sub)section. + Returns + ------- + Section instance + The section that has been added or modified. 
+ """ *subsection_names, leaf_node_name = split_subsection_names(key) section = self._select(subsection_names) diff --git a/skops/card/_parser.py b/skops/card/_parser.py index f6e22aea..7f051b04 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -12,7 +12,7 @@ import subprocess from pathlib import Path from tempfile import mkdtemp -from typing import Any, Sequence +from typing import Any, Literal, Sequence import yaml # type: ignore @@ -25,35 +25,57 @@ class PandocParser: - """TODO""" + """Create model cards from files parsed through pandoc. - def __init__(self, source, mapping="markdown") -> None: - self.source = source - if mapping == "markdown": - self.mapping = Markdown() - else: - raise ValueError(f"Markup of type {mapping} is not supported (yet)") + This class knows about the implementation details of the + :class:`~skops.card.Card` and generates it by initializing an empty class + and then calling its methods with the input provided by pandoc. - self.card = Card(None, template=None) - self._section_trace: list[str] = [] - self._cur_section: Section | None = None + ``PandocParser`` does not know about any specific markup type, such as + markdown. Instead, it is initialized with a ``Mapping``, which is + responsible to convert pandoc input into the desired markup language. - def get_cur_level(self) -> int: - # level 0 can be interpreted implictly as the root level - return len(self._section_trace) + After initializing this class, call + :meth:`~skops.card._parser.PandocParser.generate` to generate the resulting + :class:`~skops.card.Card` instance. + + Parameters + ---------- + source : str + The model card parsed using the ``pandoc -t json`` option. - def get_cur_section(self): - # including supersections - return "/".join(self._section_trace) + markup_type : "markdown" + The type of markup that was used for the model card. Right now, only + ``"markdown"`` is supported. 
- def add_section(self, section_name: str) -> None: - self._cur_section = self.card._add_single(self.get_cur_section(), "") + """ - def add_content(self, content: str) -> None: - section = self._cur_section + def __init__( + self, source: str, markup_type: Literal["markdown"] = "markdown" + ) -> None: + self.source = source + if markup_type.lower() == "markdown": + self.mapping = Markdown() + else: + raise ValueError(f"Markup of type {markup_type} is not supported (yet)") + + def _add_section( + self, section_name: str, card: Card, section_trace: list[str] + ) -> Section: + # Add a new section to the card, which can be a subsection, and return + # it. + section_name = "/".join(section_trace) + cur_section = card._add_single(section_name, "") + return cur_section + + def _add_content(self, content: str, section: Section | None) -> None: + # Add content to the current section if section is None: + # This may occur if the model card starts without a section. This is + # not illegal in markdown, but we don't handle it yet. raise ValueError( - "Ooops, no current section, please open an issue on GitHub" + "Trying to add content but there is no current section, " + "this is probably a bug, please open an issue on GitHub" ) if not section.content: @@ -61,40 +83,57 @@ def add_content(self, content: str) -> None: elif isinstance(section.content, str): section.content = section.content + "\n\n" + content else: - # A Formattable, no generic way to modify it -- should we add an - # update method? + # TODO: Content is a Formattable, no generic way to modify it -- + # should we require each Formattable to have an update method? raise ValueError(f"Could not modify content of {section.content}") - def parse_header(self, item: PandocItem) -> str: + def _parse_header( + self, item: PandocItem, section_trace: list[str] + ) -> tuple[str, int]: # Headers are the only type of item that needs to be handled # differently. 
This is because we structure the underlying model card # data as a tree with nodes corresponding to headers. To assign the # right parent or child node, we need to keep track of the level of the - # headers. This cannot be done solely by the markdown mapping, since it - # is not aware of the tree structure. + # headers. This cannot be done on the level of the markdown mapping, + # since it is not aware of the tree structure. level, _, _ = item["c"] content = self.mapping(item) - self._section_trace = self._section_trace[: level - 1] + [content] - return content + return content, level - def post_process(self, res: str) -> str: + def _post_process(self, res: str) -> str: # replace Latin1 space res = res.replace("\xa0", " ") return res def generate(self) -> Card: - # Parsing the flat structure, not recursively as in pandocfilters. - # After visiting the parent node, it's not necessary to visit its - # child nodes, because that's already done during parsing. + """Generate the model card instance from the parsed card. + + Returns + ------- + card : :class:`~skops.card.Card` + The parsed model card instance. If not further modified, the output + of saving that card should be (almost) identical to the initial + model card. + """ + section: Section | None = None + section_trace: list[str] = [] + card = Card(None, template=None) + + # Parsing the flat structure, not recursively as in pandocfilters. After + # visiting the parent node, it's not necessary to visit its child nodes, + # because the mapping class already takes care of visiting the child + # nodes. 
for item in json.loads(self.source)["blocks"]: if item["t"] == "Header": - res = self.post_process(self.parse_header(item)) - self.add_section(res) + content, level = self._parse_header(item, section_trace=section_trace) + res = self._post_process(content) + section_trace = section_trace[: level - 1] + [res] + section = self._add_section(res, card=card, section_trace=section_trace) else: - res = self.post_process(self.mapping(item)) - self.add_content(res) + res = self._post_process(self.mapping(item)) + self._add_content(res, section=section) - return self.card + return card def _get_pandoc_version() -> list[int]: From a2f90a457e245a14b58a8bb5d224c9cea12317c6 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 2 Jan 2023 12:23:46 +0100 Subject: [PATCH 31/47] Skip parts of parse_modelcard docstring test This is because to run this doctest would require pandoc to be installed, but running the doctests should not have a dependency on pandoc. --- skops/card/_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 7f051b04..e0af5723 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -283,14 +283,14 @@ def parse_modelcard(path: str | Path) -> Card: >>> y = np.dot(X, np.array([1, 2])) + 3 >>> regr = LinearRegression().fit(X, y) >>> card = Card(regr) - >>> card.save("README.md") + >>> card.save("README.md") # doctest: +SKIP >>> # later, load the card again - >>> parsed_card = parse_modelcard("README.md") + >>> parsed_card = parse_modelcard("README.md") # doctest: +SKIP >>> # continue editing the card - >>> parsed_card.add(**{"My new section": "My new content"}) + >>> parsed_card.add(**{"My new section": "My new content"}) # doctest: +SKIP Card(...) 
>>> # overwrite old card with new one - >>> parsed_card.save("README.md") + >>> parsed_card.save("README.md") # doctest: +SKIP Notes ----- From 7ececc1e589c8b2680e11243842dbbadc489d7e2 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 3 Jan 2023 18:05:23 +0100 Subject: [PATCH 32/47] Increase test coverage, add a few features - Checking a few more error cases - Document lines that are not covered - Add support for more syntax, most notably: - nested bullet lists - nested ordered lists --- skops/card/_markup.py | 113 ++++++----- skops/card/_parser.py | 6 + skops/card/tests/examples/toy-example.md | 182 ++++++++++++++++++ skops/card/tests/examples/toy-example.md.diff | 90 +++++++++ skops/card/tests/test_parser.py | 62 +++++- 5 files changed, 403 insertions(+), 50 deletions(-) create mode 100644 skops/card/tests/examples/toy-example.md create mode 100644 skops/card/tests/examples/toy-example.md.diff diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 531177db..8ffa14b6 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -4,6 +4,7 @@ import sys from collections.abc import Mapping +from contextlib import contextmanager from typing import Any, Sequence from skops.card._model_card import TableSection @@ -37,17 +38,14 @@ def __init__(self): # markdown syntax dispatch table self.mapping = { "Space": self._space, + "Plain": self._plain, + "Str": self._str, "Strong": self._strong, "Emph": self._emph, "Strikeout": self._strikeout, - "Subscript": self._subscript, - "Superscript": self._superscript, - "Plain": self._plain, - "Str": self._str, "RawInline": self._raw_inline, "RawBlock": self._raw_block, "SoftBreak": self._soft_break, - "LineBreak": self._line_break, "Para": self._para, "Header": self._header, "Image": self._image, @@ -57,14 +55,41 @@ def __init__(self): "Div": self._parse_div, "Link": self._link, "BulletList": self._bullet_list, + "OrderedList": self._ordered_list, "Quoted": self._quoted, "BlockQuote": self._block_quote, } + # 
Start indentation level at -1 because we want the first incremented + # indentation level to be at 0. Otherwise we would need to keep track if + # it's the first time and then don't increment, which is more + # complicated. + self._indent_trace = [] + + @contextmanager + def _indented(self, *, spaces: int): + """Temporarily increment indentation by one""" + self._indent_trace.append(spaces) + yield + self._indent_trace.pop(-1) + + def _get_indent(self, *, incr: int = 0) -> str: + """Get current indentation, optionally incremented""" + # TODO: explain why skipping 1st item + return " " * (incr + sum(self._indent_trace[:-1])) @staticmethod def _space(value) -> str: return " " + def _plain(self, value) -> str: + parts = [self.__call__(subitem) for subitem in value] + return "".join(parts) + + @staticmethod + def _str(value) -> str: + # escape \ + return value.replace("\\", "\\\\") + def _strong(self, value) -> str: parts = ["**"] parts += [self.__call__(subitem) for subitem in value] @@ -83,27 +108,6 @@ def _strikeout(self, value) -> str: parts.append("~~") return "".join(parts) - def _subscript(self, value) -> str: - parts = [""] - parts += [self.__call__(subitem) for subitem in value] - parts.append("") - return "".join(parts) - - def _superscript(self, value) -> str: - parts = [""] - parts += [self.__call__(subitem) for subitem in value] - parts.append("") - return "".join(parts) - - def _plain(self, value) -> str: - parts = [self.__call__(subitem) for subitem in value] - return "".join(parts) - - @staticmethod - def _str(value) -> str: - # escape \ - return value.replace("\\", "\\\\") - @staticmethod def _raw_inline(value) -> str: _, line = value @@ -115,13 +119,9 @@ def _raw_block(self, item) -> str: _, line = item return line - @staticmethod - def _soft_break(value) -> str: - return "\n" - - @staticmethod - def _line_break(value) -> str: - return "\n" + def _soft_break(self, value) -> str: + incr = 0 if not self._indent_trace else self._indent_trace[-1] + return 
"\n" + self._get_indent(incr=incr) def _make_content(self, content): parts = [] @@ -142,10 +142,16 @@ def _header(self, value: tuple[int, Any, list[dict[str, str]]]) -> str: def _image(self, value) -> str: (ident, _, keyvals), caption, (dest, typef) = value # it seems like ident and keyvals are not relevant for markdown - assert caption - assert typef == "fig:" - caption = "".join([self.__call__(i) for i in caption]) + if not caption: + # not sure if this can be reached, just to be safe + raise ValueError("Figure missing a caption") + + if not typef.startswith("fig:"): + # not sure if this can be reached, just to be safe + raise ValueError(f"Cannot deal with figure of type '{typef}'") + + caption = "".join(self.__call__(i) for i in caption) content = f"![{caption}]({dest})" return content @@ -215,7 +221,7 @@ def _parse_div(self, item) -> str: # note that in markdown, we basically just use the raw html (ident, classes, kvs), contents = item - # build diff tag + # build div tag tags = [" str: start = "".join(tags) middle = [] for content in contents: - middle.append(self.__call__(content)) + with self._indented(spaces=2): + middle.append(self.__call__(content)) end = "" return "".join([start] + middle + [end]) @@ -244,14 +251,31 @@ def _link(self, item) -> str: txt_formatted = self._make_content(txt) return f"[{txt_formatted}]({src})" + def _make_list_item(self, items: str, list_marker: str): + # helper function used for bullet and ordered lists + parts = [self.__call__(subitem) for subitem in items] + content = "\n".join(parts) + return f"{self._get_indent()}{list_marker} {content}" + def _bullet_list(self, item) -> str: + # we don't differentiate between lists starting with "-", "*", or "+". 
+ list_marker = "-" parts = [] - for subitem in item: - assert len(subitem) == 1 - content = "".join(self.__call__(i) for i in subitem) - # indent the lines in lists if they contain line breaks - content = content.replace("\n", "\n ") - parts.append(f"- {content}") + # bullet lists use 2 spaces for indentation to align "- " + with self._indented(spaces=2): + for subitem in item: + parts.append(self._make_list_item(subitem, list_marker=list_marker)) + return "\n".join(parts) + + def _ordered_list(self, item) -> str: + # we don't make use of num_type and sep_type, which just indicates that + # numbers are presented as decimal numbers using a period + (start, num_type, sep_type), items = item + parts = [] + # ordered lists use 3 spaces for indentation to align "1. " + with self._indented(spaces=3): + for i, subitem in enumerate(items, start=start): + parts.append(self._make_list_item(subitem, list_marker=f"{i}.")) return "\n".join(parts) def _quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: @@ -260,6 +284,7 @@ def _quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: try: sym = {"DoubleQuote": '"', "SingleQuote": "'"}[type_] except KeyError as exc: + # can probably not be reached, but let's be sure msg = ( f"The parsed document contains '{type_}', which is not " "supported yet, please open an issue on GitHub" diff --git a/skops/card/_parser.py b/skops/card/_parser.py index e0af5723..8236f6fd 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -103,6 +103,10 @@ def _parse_header( def _post_process(self, res: str) -> str: # replace Latin1 space res = res.replace("\xa0", " ") + + # pandoc creates ☒ and ☐ for to do items but GitHub requires [x] and [ ] + # for an item to be considered a to do item + res = res.replace("- ☒", "- [x]").replace("- ☐", "- [ ]") return res def generate(self) -> Card: @@ -308,6 +312,8 @@ def parse_modelcard(path: str | Path) -> Card: - Quote symbols may differ, e.g. ``it’s`` becoming ``it's``. 
- The number of empty lines may differ, e.g. two empty lines being transformed into one empty line. + - The optional title of links is not preserved, as e.g. in + `[text](https://example.com "this disappears")` - Trailing whitespace is removed. - Tab indentation may be removed, e.g. in raw html. - The yaml part of the model card can have some non-semantic differences, diff --git a/skops/card/tests/examples/toy-example.md b/skops/card/tests/examples/toy-example.md new file mode 100644 index 00000000..44669f11 --- /dev/null +++ b/skops/card/tests/examples/toy-example.md @@ -0,0 +1,182 @@ +# This document tries to cover many common markdown contents + +This is not based on an existing model card and serves to increase test coverage. It also documents differences that may be found after parsing. There is no metainfo section. + +## H2 + +### H3 + +#### H4 + +##### H5 + +###### H6 + +Parser 'preserves' some "quotation" marks. + +Parser doesn’t ‘preserve’ other “quotation” marks. + +## Italics + +One _way_ of doing it. +Another *way* of doing it. + +## Bold + +One __way__ of doing it. +Another **way** of doing it. + +## Strikethrough + +This is ~~not~~ the way. + +## Superscript and subscripts + +Really just html tags. + +E = mc2 + +log2 + +## Bullet lists + +Pandoc does not differentiate between different notations, so we always use -, not * or +. + +* using +* asterisk + +or + +- using +- minus + with line break + +or + ++ using plus + +Finally: + +- nesting + - is +- indeed + - very + - possible + - to achieve + +## Ordered lists + +1. a normal +2. ordered list + +or + +1. an ordered +2. list + 1. with + 2. indentation +3. is possible + +## Mixed lists + +1. it’s +2. possible + - to + - mix +3. 
ordered _and_ unordered + +## TODOs + +- [x] This +- [ ] is +- [x] **done** + +## Links + +[a link](https://skops.readthedocs.io/) + +The "title" is not parsed by pandoc + +[a link](https://skops.readthedocs.io/ "this disappears") + +[a link to a file](./toy-example.md) + +References are resolved, so `[1]` below is replaced by the actual link: + +[a link with reference][1] + +A plain link to https://skops.readthedocs.io/ used inside of text. + +[1]: https://skops.readthedocs.io/ + +## Images + +![skops logo](https://github.com/skops-dev/skops/blob/main/docs/images/logo.png) + +### Using html + +logo + +## Quotes + +> Someone said something important + +> I quote wise words: +> > Someone said something important + +## Tables + +| Header 0 | Header 1 | +|--------------|----------------| +| Some content | More content | +| _Even more_ | This is **it** | + +Empty tables are legal + +| What now? | +|-------------| + +## Inline code + +Some `inline` code. + +`A whole line` + +## Code blocks + +``` +A raw + +code block +``` + +With language + +```python +def foo(): + return 0 + +def bar(): + return 1 +``` + +## Raw HTML + + +
+
Beast of Bodmin
+
A large feline inhabiting Bodmin Moor.
+ +
Morgawr
+
A sea serpent.
+ +
Owlman
+
A giant owl-like creature.
+
+ +## Div + +The "id" tag may change in order +
+

Divs are possible

+
diff --git a/skops/card/tests/examples/toy-example.md.diff b/skops/card/tests/examples/toy-example.md.diff new file mode 100644 index 00000000..9c00ed2a --- /dev/null +++ b/skops/card/tests/examples/toy-example.md.diff @@ -0,0 +1,90 @@ +--- ++++ +@@ -0,0 +1 @@ ++ +@@ -17 +18 @@ +-Parser doesn’t ‘preserve’ other “quotation” marks. ++Parser doesn’t 'preserve' other "quotation" marks. +@@ -22 +23 @@ +-Another *way* of doing it. ++Another _way_ of doing it. +@@ -26 +27 @@ +-One __way__ of doing it. ++One **way** of doing it. +@@ -45,2 +46,2 @@ +-* using +-* asterisk ++- using ++- asterisk +@@ -56 +57 @@ +-+ using plus ++- using plus +@@ -100 +101 @@ +-[a link](https://skops.readthedocs.io/ "this disappears") ++[a link](https://skops.readthedocs.io/) +@@ -106 +107 @@ +-[a link with reference][1] ++[a link with reference](https://skops.readthedocs.io/) +@@ -109,2 +109,0 @@ +- +-[1]: https://skops.readthedocs.io/ +@@ -164 +163,6 @@ +- ++ ++ +@@ -167,8 +171,37 @@ +-
Beast of Bodmin
+-
A large feline inhabiting Bodmin Moor.
+- +-
Morgawr
+-
A sea serpent.
+- +-
Owlman
+-
A giant owl-like creature.
++ ++
++ ++Beast of Bodmin ++ ++
++ ++
++ ++A large feline inhabiting Bodmin Moor. ++ ++
++ ++
++ ++Morgawr ++ ++
++ ++
++ ++A sea serpent. ++ ++
++ ++
++ ++Owlman ++ ++
++ ++
++ ++A giant owl-like creature. ++ ++
++ +@@ -180,3 +213,2 @@ +-
+-

Divs are possible

+-
++ ++

Divs are possible

diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index b723c03e..a7650b07 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -1,5 +1,7 @@ import difflib +import json import os +import re from pathlib import Path import numpy as np @@ -8,7 +10,7 @@ from sklearn.linear_model import LinearRegression from skops.card import Card, parse_modelcard -from skops.card._parser import check_pandoc_installed +from skops.card._parser import PandocParser, check_pandoc_installed try: check_pandoc_installed() @@ -44,11 +46,14 @@ def card(fit_model, tmp_path): EXAMPLE_CARDS = [ + # actual model cards from HF hub "bert-base-uncased.md", "clip-vit-large-patch14.md", "gpt2.md", "specter.md", "vit-base-patch32-224-in21k.md", + # not a model card + "toy-example.md", ] @@ -72,11 +77,12 @@ def assert_readme_files_almost_equal(file0, file1, diff): readme1 = f.readlines() sep = "---\n" - idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1 - meta0, meta1 = readme0[1:idx0], readme1[1:idx1] - readme0, readme1 = readme0[idx0:], readme1[idx1:] - - _assert_meta_equal(meta0, meta1) + # we look for 2nd occurrence, so skip first char to not match 1st occurrence + if sep in readme0[1:]: # only check if metainfo is present + idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1 + meta0, meta1 = readme0[1:idx0], readme1[1:idx1] + readme0, readme1 = readme0[idx0:], readme1[idx1:] + _assert_meta_equal(meta0, meta1) # exclude trivial case of both being empty assert readme0 @@ -117,3 +123,47 @@ def test_example_model_cards(tmp_path, file_name): parsed_card.save(file1) assert_readme_files_almost_equal(file0, file1, diff) + + +def test_unknown_pandoc_item_raises(): + source = json.dumps( + { + "pandoc-api-version": [1, 22, 2, 1], + "meta": {}, + "blocks": [ + { + "t": "Header", + "c": [1, ["section", [], []], [{"t": "Str", "c": "section"}]], + }, + {"c": "valid", "t": "Str"}, + {"t": "does-not-exist", "c": []}, + 
{"c": "okay", "t": "Str"}, + ], + } + ) + parser = PandocParser(source) + msg = ( + "The parsed document contains 'does-not-exist', which is not " + "supported yet, please open an issue on GitHub" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + parser.generate() + + +def test_content_without_section_raises(): + source = json.dumps( + { + "pandoc-api-version": [1, 22, 2, 1], + "meta": {}, + "blocks": [ + {"c": "whoops", "t": "Str"}, + ], + } + ) + parser = PandocParser(source) + msg = ( + "Trying to add content but there is no current section, this is probably a " + "bug, please open an issue on GitHub" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + parser.generate() From 08505f92e8945dc6cb0d94a92cdb5c3d6ff10f09 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 5 Jan 2023 15:29:53 +0100 Subject: [PATCH 33/47] Install pandoc on linux CI to test parser --- .github/workflows/build-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 16173cf9..106d5ae6 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -61,6 +61,9 @@ jobs: then pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn; else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; fi + if [ ${{ matrix.os }} == "ubuntu-latest" ]; + then wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb && sudo dpkg -i pandoc-2.19.2-1-amd64.deb; + fi python --version pip --version pip list From d0aa9df1577e05984a0b739ccbc2973d2303ef92 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 17 Jan 2023 16:14:15 +0100 Subject: [PATCH 34/47] Add documentation about parsing model cards --- docs/model_card.rst | 50 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/docs/model_card.rst b/docs/model_card.rst index 7a6c124d..9d4bffa7 
100644 --- a/docs/model_card.rst +++ b/docs/model_card.rst @@ -11,6 +11,9 @@ beginning of it, following with the content of the model card in markdown format. The metadata section is used to make models searchable on the Hub, and get the inference API and the widgets on the website working. +Metadata +-------- + The metadata part of the file needs to follow the specifications `here `__. It includes simple attributes of your models such as the task you're solving, @@ -40,6 +43,9 @@ Here's an example of the metadata section of the ``README.md`` file: ``skops`` creates this section of the file for you, and you almost never need to touch it yourself. +Model Card Content +------------------ + The markdown part does not necessarily need to follow any specification in terms of information passed, which gives the user a lot of flexibility. The markdown part of the ``README.md`` file comes with a couple of defaults provided @@ -90,8 +96,8 @@ as well as adding some subsections with plots below that, you can call the }) Furthermore, you can select existing sections (as well as their subsections) -using :meth:`Card.select`, and you can delete sections using -:meth:`Card.delete`: +using :meth:`.Card.select`, and you can delete sections using +:meth:`.Card.delete`: .. code-block:: python @@ -103,3 +109,43 @@ using :meth:`Card.select`, and you can delete sections using To see how you can use the API in ``skops`` to create a model card, please refer to :ref:`sphx_glr_auto_examples_plot_model_card.py`. + +Saving and Loading Model Cards +------------------------------ + +Once you have finished creating and modifying the model card, you can save it +using the :meth:`.Card.save` method: + +.. code-block:: python + + card.save("README.md") + +This renders the content of the model card to markdown format and stores it in +the indicated file. It is now ready to be uploaded to Hugging Face Hub. 
+ +If you have a finished model card but want to load it to make some modifications, +you can use the function :func:`skops.card.parse_modelcard`. This function +parses the model card back into a :class:`.Card` instance that you can work on +further: + +.. code-block:: python + + from skops import card + model_card = card.parse_modelcard("README.md") + model_card.add(**{"A new section": "Some new content"}) + model_card.save("README.md") + +When the card is parsed, some minor details of the model card can change, e.g. +if you used different column alignment than the default, this could change, as +well as removing excess empty lines or trailing whitespace. However, the content +itself should be exactly the same. All known deviations are documented in the +`parse_modelcard docs +`_ + +For the parsing part, we rely on `pandoc `_. If you haven't +installed it, please follow `these instructions +`_. The advantage of using pandoc is that +it's a very mature library and that it supports many different document formats. +Therefore, it should be possible to parse model cards even if they use a format +that's not markdown, for instance reStructuredText, org, or asciidoc. For +saving, we only support markdown, though. From 5da3397134f909a4ce3d6bea1429a63492165e09 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 17 Jan 2023 16:40:46 +0100 Subject: [PATCH 35/47] Fix incorrect link in docs --- docs/model_card.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model_card.rst b/docs/model_card.rst index 9d4bffa7..ccfd6cde 100644 --- a/docs/model_card.rst +++ b/docs/model_card.rst @@ -140,7 +140,7 @@ if you used different column alignment than the default, this could change, as well as removing excess empty lines or trailing whitespace. However, the content itself should be exactly the same. All known deviations are documented in the `parse_modelcard docs -`_ +`_ For the parsing part, we rely on `pandoc `_.
If you haven't installed it, please follow `these instructions From 2cb998e756b4d97a10f2f82e7e4dacf7e3471ae6 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 17 Jan 2023 17:03:45 +0100 Subject: [PATCH 36/47] Clean up test_parser.py It contained remnants from an old test that has been removed since then. --- skops/card/tests/test_parser.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index a7650b07..30f84f00 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -4,12 +4,10 @@ import re from pathlib import Path -import numpy as np import pytest import yaml # type: ignore -from sklearn.linear_model import LinearRegression -from skops.card import Card, parse_modelcard +from skops.card import parse_modelcard from skops.card._parser import PandocParser, check_pandoc_installed try: @@ -19,32 +17,6 @@ pytest.skip(reason="These tests require a recent pandoc", allow_module_level=True) -@pytest.fixture -def fit_model(): - X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) - y = np.dot(X, np.array([1, 2])) + 3 - reg = LinearRegression().fit(X, y) - return reg - - -@pytest.fixture -def card(fit_model, tmp_path): - card = Card(fit_model) - - try: - import matplotlib.pyplot as plt - - fig, ax = plt.subplots() - ax.plot([0, 1]) - fig.savefig(tmp_path / "my-throwaway-plot.png") - card.add_plot(**{"My plots/My first plot": "my-throwaway-plot.png"}) - except ImportError: - pass - - card.add_table(**{"A table": {"col0": [0, 1], "col1": [2, 3]}}) - return card - - EXAMPLE_CARDS = [ # actual model cards from HF hub "bert-base-uncased.md", From 1922bc147ae85d06f7effde3e32d4fda63cc78ae Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 18 Jan 2023 12:24:55 +0100 Subject: [PATCH 37/47] Move changes entry to v0.5 Was added to v0.4 but that version is already released. 
--- docs/changes.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index f1aa7739..fc579c53 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -9,6 +9,14 @@ skops Changelog :depth: 1 :local: +v0.5 +---- +- :class:`.card.Card` now allows to add content to existing sections, using a + ``/`` to separate the subsections. E.g. use ``card.add(**{"Existing + section/New section": "content"})`` to add "content" a new subsection called + "New section" to an existing section called "Existing section". :pr:`203` by + `Benjamin Bossan`_. + v0.4 ---- - :func:`.io.dump` and :func:`.io.load` now work with file like objects, @@ -21,11 +29,6 @@ v0.4 :pr:`242` by `Merve Noyan`_. - Persistence now supports bytes and bytearrays, added tests to verify that LightGBM, XGBoost, and CatBoost work now. :pr:`244` by `Benjamin Bossan`_. -- :class:`.card.Card` now allows to add content to existing sections, using a - ``/`` to separate the subsections. E.g. use ``card.add(**{"Existing - section/New section": "content"})`` to add "content" a new subsection called - "New section" to an existing section called "Existing section". :pr:`203` by - `Benjamin Bossan`_. v0.3 ---- From 709e079c6f22dc95bc7267beb267cbd8bc45671c Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 11:46:22 +0100 Subject: [PATCH 38/47] Reviewer comments: wording, period in err msg --- docs/model_card.rst | 2 +- skops/card/_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/model_card.rst b/docs/model_card.rst index ccfd6cde..d3b726d4 100644 --- a/docs/model_card.rst +++ b/docs/model_card.rst @@ -148,4 +148,4 @@ installed it, please follow `these instructions it's a very mature library and that it supports many different document formats. Therefore, it should be possible to parse model cards even if they use a format that's not markdown, for instance reStructuredText, org, or asciidoc. 
For -saving, we only support markdown, though. +saving, we only support markdown for now. diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 8236f6fd..3d00599a 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -75,7 +75,7 @@ def _add_content(self, content: str, section: Section | None) -> None: # not illegal in markdown, but we don't handle it yet. raise ValueError( "Trying to add content but there is no current section, " - "this is probably a bug, please open an issue on GitHub" + "this is probably a bug, please open an issue on GitHub." ) if not section.content: From 08f44c4a04edb7ea261ec91943c3de9b8efcdd33 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 14:34:21 +0100 Subject: [PATCH 39/47] Use packaging Version for version comparison Add packaging as an explicit dependency. Kinda arbitrarily set min version to be 17.0, which was released 5 years ago. --- skops/_min_dependencies.py | 1 + skops/card/_parser.py | 31 ++++++++----------------------- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py index 35219a12..1ff8b699 100644 --- a/skops/_min_dependencies.py +++ b/skops/_min_dependencies.py @@ -25,6 +25,7 @@ "sphinx-prompt": ("1.3.0", "docs", None), "sphinx-issues": ("1.2.0", "docs", None), "matplotlib": ("3.3", "docs, tests", None), + "packaging": ("17.0", "install", None), "pandas": ("1", "docs, tests", None), # required for persistence tests of external libraries "lightgbm": ("3", "tests", None), diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 3d00599a..517be702 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -12,16 +12,17 @@ import subprocess from pathlib import Path from tempfile import mkdtemp -from typing import Any, Literal, Sequence +from typing import Any, Literal import yaml # type: ignore +from packaging.version import Version from skops.card import Card from skops.card._model_card import Section 
from ._markup import Markdown, PandocItem -PANDOC_MIN_VERSION = (2, 19, 0) +PANDOC_MIN_VERSION = "2.19.0" class PandocParser: @@ -140,7 +141,7 @@ def generate(self) -> Card: return card -def _get_pandoc_version() -> list[int]: +def _get_pandoc_version() -> str: """Shell out to retrieve the pandoc version Raises @@ -161,29 +162,12 @@ def _get_pandoc_version() -> list[int]: if not version_info.startswith("pandoc "): raise RuntimeError("Could not determine version of pandoc") - _, _, actual_version = version_info.partition(" ") - pandoc_version = [int(v) for v in actual_version.split(".")] + _, _, pandoc_version = version_info.partition(" ") return pandoc_version -def _check_version_greater_equal( - version: Sequence[int], min_version: Sequence[int] -) -> None: - """Very bad version comparison function to ensure that the first version is - >= the second.""" - for v1, v2 in zip(version, min_version): - if v1 > v2: - return - - if v1 < v2: - raise ValueError( - "Pandoc version too low, expected at least " - f"{'.'.join(map(str, min_version))}" - ) - - def check_pandoc_installed( - min_version: Sequence[int] | None = PANDOC_MIN_VERSION, + min_version: str | None = PANDOC_MIN_VERSION, ) -> None: """Check if pandoc is installed on the system @@ -216,7 +200,8 @@ def check_pandoc_installed( if not min_version: return - _check_version_greater_equal(pandoc_version, min_version) + if Version(pandoc_version) < Version(min_version): + raise ValueError("Pandoc version too low, expected at least {min_version}") def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]: From 7c416c41534f3f55f3c7cba8123eeac37f8c34a8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 14:35:55 +0100 Subject: [PATCH 40/47] Install pandoc with apt in CI --- .github/workflows/build-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 106d5ae6..6b340c0b 100644 --- 
a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -62,7 +62,7 @@ jobs: else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; fi if [ ${{ matrix.os }} == "ubuntu-latest" ]; - then wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb && sudo dpkg -i pandoc-2.19.2-1-amd64.deb; + then sudo apt install pandoc; fi python --version pip --version From b855e50ff5636ed754606593e8610e64a9bf06b8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 14:52:35 +0100 Subject: [PATCH 41/47] Investigate pandoc version issue --- .github/workflows/build-test.yml | 2 +- skops/card/_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 6b340c0b..31d5f669 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -62,7 +62,7 @@ jobs: else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; fi if [ ${{ matrix.os }} == "ubuntu-latest" ]; - then sudo apt install pandoc; + then sudo apt install pandoc && pandoc --version; fi python --version pip --version diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 517be702..2dad8f45 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -201,7 +201,7 @@ def check_pandoc_installed( return if Version(pandoc_version) < Version(min_version): - raise ValueError("Pandoc version too low, expected at least {min_version}") + raise ValueError(f"Pandoc version too low, expected at least {min_version}") def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]: From 2da5ea61db674a09478b32e22b04eaf32a3264c8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 15:02:19 +0100 Subject: [PATCH 42/47] Lower min required pandoc version to 2.9.0 --- skops/card/_parser.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skops/card/_parser.py 
b/skops/card/_parser.py index 2dad8f45..fecdd4d8 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -22,7 +22,7 @@ from ._markup import Markdown, PandocItem -PANDOC_MIN_VERSION = "2.19.0" +PANDOC_MIN_VERSION = "2.9.0" class PandocParser: @@ -201,7 +201,10 @@ def check_pandoc_installed( return if Version(pandoc_version) < Version(min_version): - raise ValueError(f"Pandoc version too low, expected at least {min_version}") + raise ValueError( + f"Pandoc version too low, expected at least {min_version}, " + f"got {pandoc_version} instead." + ) def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str, Any]]: From 6d1f288baf6c5a38ff163560818402e9db425767 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 23 Jan 2023 14:04:30 +0100 Subject: [PATCH 43/47] Revert pandoc install to use gh + deb Apt repo is very outdated. --- .github/workflows/build-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 31d5f669..106d5ae6 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -62,7 +62,7 @@ jobs: else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; fi if [ ${{ matrix.os }} == "ubuntu-latest" ]; - then sudo apt install pandoc && pandoc --version; + then wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb && sudo dpkg -i pandoc-2.19.2-1-amd64.deb; fi python --version pip --version From fb46a76cd8bcb12a899d0e92ccc41513ce339a85 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 23 Jan 2023 14:32:44 +0100 Subject: [PATCH 44/47] Add Figure type to parsing Figure was added in Pandoc v3.0 and is more complex than Image, but also more powerful. At the moment, we treat Figure just as Image. 
--- skops/card/_markup.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 8ffa14b6..00801f07 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -49,6 +49,7 @@ def __init__(self): "Para": self._para, "Header": self._header, "Image": self._image, + "Figure": self._figure, "CodeBlock": self._code_block, "Code": self._code, "Table": self._table, @@ -155,6 +156,19 @@ def _image(self, value) -> str: content = f"![{caption}]({dest})" return content + def _figure(self, value) -> str: + # Figure type was added in Pandoc v3.0 + (ident, classes, keyvals), caption, (body,) = value + + body_type = body["t"] + # we can only deal with plain figures for now + if body_type != "Plain": + raise ValueError(f"Cannot deal with figure of type '{body_type}'") + + plain_fig = body["c"][0]["c"] + plain_fig[2][1] = "fig:" + return self._image(plain_fig) + @staticmethod def _code_block(item: tuple[tuple[int, list[str], list[str]], str]) -> str: # a codeblock consists of: (id, classes, namevals) contents From 30d3e5c403b2e563cadb10af9b6170da47e046d2 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 24 Jan 2023 12:17:30 +0100 Subject: [PATCH 45/47] Add pragmas for lines that are covered Those are things that don't really make sense to be tested. 
--- skops/card/_markup.py | 8 ++++---- skops/card/_parser.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/skops/card/_markup.py b/skops/card/_markup.py index 00801f07..7779a675 100644 --- a/skops/card/_markup.py +++ b/skops/card/_markup.py @@ -144,11 +144,11 @@ def _image(self, value) -> str: (ident, _, keyvals), caption, (dest, typef) = value # it seems like ident and keyvals are not relevant for markdown - if not caption: + if not caption: # pragma: no cover # not sure if this can be reached, just to be safe raise ValueError("Figure missing a caption") - if not typef.startswith("fig:"): + if not typef.startswith("fig:"): # pragma: no cover # not sure if this can be reached, just to be safe raise ValueError(f"Cannot deal with figure of type '{typef}'") @@ -156,7 +156,7 @@ def _image(self, value) -> str: content = f"![{caption}]({dest})" return content - def _figure(self, value) -> str: + def _figure(self, value) -> str: # pragma: no cover # Figure type was added in Pandoc v3.0 (ident, classes, keyvals), caption, (body,) = value @@ -297,7 +297,7 @@ def _quoted(self, item: tuple[dict[str, str], list[PandocItem]]) -> str: type_ = quote_type["t"] try: sym = {"DoubleQuote": '"', "SingleQuote": "'"}[type_] - except KeyError as exc: + except KeyError as exc: # pragma: no cover # can probably not be reached, but let's be sure msg = ( f"The parsed document contains '{type_}', which is not " diff --git a/skops/card/_parser.py b/skops/card/_parser.py index fecdd4d8..fd4dae38 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -83,7 +83,7 @@ def _add_content(self, content: str, section: Section | None) -> None: section.content = content elif isinstance(section.content, str): section.content = section.content + "\n\n" + content - else: + else: # pragma: no cover # TODO: Content is a Formattable, no generic way to modify it -- # should we require each Formattable to have an update method? 
raise ValueError(f"Could not modify content of {section.content}") @@ -240,7 +240,8 @@ def _card_with_detached_metainfo(path: str | Path) -> tuple[str | Path, dict[str return path, metainfo idx_separator = text.find(sep_end) - if idx_separator < len(sep_start): # separator shouldn't come earlier than this + if idx_separator < len(sep_start): # pragma: no cover + # separator shouldn't come earlier than this return path, metainfo # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated From de1837ec9ca4e96edba628aacf8577854c6dd5c3 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 24 Jan 2023 12:18:17 +0100 Subject: [PATCH 46/47] Add more unit tests, get to 100% coverage - test error when trying to use markup != markdown - test no version check for pandoc - test min pandoc version too low - test pandoc not installed - test pandoc version cannot be determined --- skops/card/_parser.py | 6 +++-- skops/card/tests/test_parser.py | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/skops/card/_parser.py b/skops/card/_parser.py index fd4dae38..86f80861 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -160,7 +160,8 @@ def _get_pandoc_version() -> str: ) version_info = str(proc.stdout.decode("utf-8")).split("\n", 1)[0] if not version_info.startswith("pandoc "): - raise RuntimeError("Could not determine version of pandoc") + # pandoc is installed but version cannot be determined + raise RuntimeError("Could not determine version of pandoc.") _, _, pandoc_version = version_info.partition(" ") return pandoc_version @@ -190,10 +191,11 @@ def check_pandoc_installed( try: pandoc_version = _get_pandoc_version() except FileNotFoundError as exc: + # pandoc is not installed msg = ( "This feature requires the pandoc library to be installed on your system, " "please follow these install instructions: " - "https://pandoc.org/installing.html" + "https://pandoc.org/installing.html." 
) raise FileNotFoundError(msg) from exc diff --git a/skops/card/tests/test_parser.py b/skops/card/tests/test_parser.py index 30f84f00..b74486fe 100644 --- a/skops/card/tests/test_parser.py +++ b/skops/card/tests/test_parser.py @@ -3,6 +3,7 @@ import os import re from pathlib import Path +from unittest.mock import Mock, patch import pytest import yaml # type: ignore @@ -139,3 +140,41 @@ def test_content_without_section_raises(): ) with pytest.raises(ValueError, match=re.escape(msg)): parser.generate() + + +def test_unsupported_markup_raises(): + match = re.escape("Markup of type does-not-exist is not supported (yet)") + with pytest.raises(ValueError, match=match): + PandocParser(source="", markup_type="does-not-exist") + + +def test_check_pandoc_installed_no_min_version_works(): + # check that it doesn't raise + check_pandoc_installed(min_version=None) + + +def test_check_pandoc_installed_min_version_too_high_raises(): + match = re.escape("Pandoc version too low, expected at least 999.9.9, got") + with pytest.raises(ValueError, match=match): + check_pandoc_installed(min_version="999.9.9") + + +def test_pandoc_not_installed(): + def raise_filenotfound(*args, **kwargs): + # error raised when trying to run subprocess on non-existing command + raise FileNotFoundError("[Errno 2] No such file or directory: 'pandoc'") + + with patch("subprocess.run", raise_filenotfound): + match = re.escape( + "This feature requires the pandoc library to be installed on your system" + ) + with pytest.raises(FileNotFoundError, match=match): + check_pandoc_installed() + + +def test_pandoc_version_cannot_be_determined(): + mock = Mock() + with patch("subprocess.run", mock): + match = re.escape("Could not determine version of pandoc") + with pytest.raises(RuntimeError, match=match): + check_pandoc_installed() From 6472ebfbd747c24b549f18c60ae7c4fc3ceac8ef Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 24 Jan 2023 13:18:35 +0100 Subject: [PATCH 47/47] Fix incorrect docstring --- 
skops/card/_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/card/_parser.py b/skops/card/_parser.py index 86f80861..86d4f120 100644 --- a/skops/card/_parser.py +++ b/skops/card/_parser.py @@ -151,7 +151,7 @@ def _get_pandoc_version() -> str: Returns ------- - pandoc_version : list[int] + pandoc_version : str The pandoc version as a list of ints. """ proc = subprocess.run(