From 572f5f1d60b1ad179a83b998e9b9a82ab4823a7f Mon Sep 17 00:00:00 2001
From: Andrei Kislichenko
Date: Tue, 19 May 2026 17:49:34 -0400
Subject: [PATCH 1/2] Preserve XML-fragment markup in Bibcollection
title/author
Port of fd20c9d (#125) to the v2/lutaml-integration branch.
Switch Bibcollection.from_xml to read the collection title and author
via inner_html instead of Nokogiri's .text, so the in-memory strings
keep their XML-fragment form (markup + entities intact). Apply the
strip_html Liquid filter where the title is emitted into the HTML
<title> tag, so the browser tab title stays plain text. Add find_html
to ElementFinder alongside find_text. Add a regression spec with inline
markup and & in both the collection title and the author name.
Refs metanorma/isodoc#785.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
lib/relaton/bibcollection.rb | 4 +-
lib/relaton/element_finder.rb | 4 ++
spec/assets/index-with-markup.xml | 23 +++++++
spec/fixtures/ietf_index.rxl | 62 ++++++++++---------
spec/relaton/cli/xml_to_html_renderer_spec.rb | 23 +++++++
templates/_index.liquid | 2 +-
6 files changed, 85 insertions(+), 33 deletions(-)
create mode 100644 spec/assets/index-with-markup.xml
diff --git a/lib/relaton/bibcollection.rb b/lib/relaton/bibcollection.rb
index bc1e5c5..d30a711 100644
--- a/lib/relaton/bibcollection.rb
+++ b/lib/relaton/bibcollection.rb
@@ -32,8 +32,8 @@ def <<(item)
# @param source [Nokogiri::XML::Element]
def self.from_xml(source)
- title = find_text("./relaton-collection/title", source)
- author = find_text(
+ title = find_html("./relaton-collection/title", source)
+ author = find_html(
"./relaton-collection/contributor[role/@type='author']/organization/"\
"name", source
)
diff --git a/lib/relaton/element_finder.rb b/lib/relaton/element_finder.rb
index cdb1230..84afe40 100644
--- a/lib/relaton/element_finder.rb
+++ b/lib/relaton/element_finder.rb
@@ -6,6 +6,10 @@ def find_text(xpath, element = nil)
find(xpath, element)&.text
end
+ def find_html(xpath, element = nil)
+ find(xpath, element)&.inner_html
+ end
+
def find(xpath, element = nil)
(element || document).at(apply_namespace(xpath))
end
diff --git a/spec/assets/index-with-markup.xml b/spec/assets/index-with-markup.xml
new file mode 100644
index 0000000..c42f9b0
--- /dev/null
+++ b/spec/assets/index-with-markup.xml
@@ -0,0 +1,23 @@
+
+ Use of ActualText & Reference structure elements
+
+
+
+ Acme & Co
+
+
+
+
+ Sample doc title
+ http://example.org/sample.pdf
+ EX 1
+
+ 2026-01-01
+
+ Published
+
+ TC EX
+
+
+
+
diff --git a/spec/fixtures/ietf_index.rxl b/spec/fixtures/ietf_index.rxl
index 1fb1bfa..eca21e8 100644
--- a/spec/fixtures/ietf_index.rxl
+++ b/spec/fixtures/ietf_index.rxl
@@ -1,6 +1,6 @@
Collection titleRibose
- The Holy Hand Grenade of Antioch
- Hand Grenade of Antioch
+ The Holy Hand Grenade of Antioch
+ Hand Grenade of Antioch
spec/fixtures/documents/antioch.xml
spec/fixtures/documents/antioch.rxl
draft-camelot-holy-grenade-01
@@ -23,7 +23,6 @@
-
en
@@ -41,10 +40,10 @@
- The Arte of ASCII: Or, An True and Accurate Representation of an Menagerie of Thynges Fabulous and Wonderful in Ye Forme of Character
+ The Arte of ASCII: Or, An True and Accurate Representation of an Menagerie of Thynges Fabulous and Wonderful in Ye Forme of Character
https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.8140.xml
https://www.rfc-editor.org/info/rfc8140
- RFC 8140
+ RFC 8140
10.17487/RFC8140
2017-04
@@ -72,9 +71,9 @@
en
- Ever since Gutenberg discovered and patented ASCII and the corresponding "Courier New" font with its now-famous "ten" point size, artisans and artificers have striven to represent their views of the world in print.Similarly, starting from Darwin's discovery of the hippogriff and his subsequent registration of the creature as an International Trade Mark, men (and some women) have struggled to catalog the fabulous variety that is called "nature".This document supplies a number of representations of all manner of things (both elemental and hypothetical) supplied by some of our best collectors of curios and delivered in a manner that may well be reused by the cunning document author.
+ Ever since Gutenberg discovered and patented ASCII and the corresponding "Courier New" font with its now-famous "ten" point size, artisans and artificers have striven to represent their views of the world in print.Similarly, starting from Darwin's discovery of the hippogriff and his subsequent registration of the creature as an International Trade Mark, men (and some women) have struggled to catalog the fabulous variety that is called "nature".This document supplies a number of representations of all manner of things (both elemental and hypothetical) supplied by some of our best collectors of curios and delivered in a manner that may well be reused by the cunning document author.
- RFC
+ RFC
8140
@@ -91,8 +90,8 @@
- RFC XML v3 Example: A Standard for the Transmission of IP Datagrams on Avian Carriers
- IP Datagrams on Avian Carriers
+ RFC XML v3 Example: A Standard for the Transmission of IP Datagrams on Avian Carriers
+ IP Datagrams on Avian Carriers
spec/fixtures/documents/example.xml
spec/fixtures/documents/example.html
spec/fixtures/documents/example.rxl
@@ -131,7 +130,6 @@
-
en
@@ -155,10 +153,10 @@
- Request For Comments reference guide
+ Request For Comments reference guide
https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.1000.xml
https://www.rfc-editor.org/info/rfc1000
- RFC 1000
+ RFC 1000
10.17487/RFC1000
1987-08
@@ -200,9 +198,9 @@
en
- This RFC Reference Guide is intended to provide a historical account by categorizing and summarizing of the Request for Comments numbers 1 through 999 issued between the years 1969-1987. These documents have been crossed referenced to indicate which RFCs are current, obsolete, or revised.
+ This RFC Reference Guide is intended to provide a historical account by categorizing and summarizing of the Request for Comments numbers 1 through 999 issued between the years 1969-1987. These documents have been crossed referenced to indicate which RFCs are current, obsolete, or revised.
- RFC
+ RFC
1000
@@ -210,10 +208,10 @@
- IAB official protocol standards
+ IAB official protocol standards
https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.1200.xml
https://www.rfc-editor.org/info/rfc1200
- RFC 1200
+ RFC 1200
10.17487/RFC1200
1991-04
@@ -239,9 +237,9 @@
en
- This memo describes the state of standardization of protocols used in the Internet as determined by the Internet Activities Board (IAB). An overview of the standards procedures is presented first, followed by discussions of the standardization process and the RFC document series, then the explanation of the terms is presented, the lists of protocols in each stage of standardization follows, and finally pointers to references and contacts for further information.
+ This memo describes the state of standardization of protocols used in the Internet as determined by the Internet Activities Board (IAB). An overview of the standards procedures is presented first, followed by discussions of the standardization process and the RFC document series, then the explanation of the terms is presented, the lists of protocols in each stage of standardization follows, and finally pointers to references and contacts for further information.
- RFC
+ RFC
1200
@@ -249,10 +247,10 @@
- Operational Criteria for Root Name Servers
+ Operational Criteria for Root Name Servers
https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.2010.xml
https://www.rfc-editor.org/info/rfc2010
- RFC 2010
+ RFC 2010
10.17487/RFC2010
1996-10
@@ -294,9 +292,9 @@
en
- This document specifies the operational requirements of root name servers, including host hardware capacities, name server software revisions, network connectivity, and physical environment. This memo provides information for the Internet community. This memo does not specify an Internet standard of any kind.
+ This document specifies the operational requirements of root name servers, including host hardware capacities, name server software revisions, network connectivity, and physical environment. This memo provides information for the Internet community. This memo does not specify an Internet standard of any kind.
- RFC
+ RFC
2010
@@ -304,10 +302,10 @@
- Managing the X.500 Root Naming Context
+ Managing the X.500 Root Naming Context
https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.2120.xml
https://www.rfc-editor.org/info/rfc2120
- RFC 2120
+ RFC 2120
10.17487/RFC2120
1997-03
@@ -335,9 +333,9 @@
en
- This document describes the use of 1993 ISO X.500 Standard protocols for managing the root context. Whilst the ASN.1 is compatible with that of the X.500 Standard, the actual settings of the parameters are supplementary to that of the X.500 Standard. This memo defines an Experimental Protocol for the Internet community.
+ This document describes the use of 1993 ISO X.500 Standard protocols for managing the root context. Whilst the ASN.1 is compatible with that of the X.500 Standard, the actual settings of the parameters are supplementary to that of the X.500 Standard. This memo defines an Experimental Protocol for the Internet community.
- RFC
+ RFC
2120
@@ -380,13 +378,15 @@
en
- The menagerie of beasts and artefacts depicted in RFC8140
+
+ The menagerie of beasts and artefacts depicted in RFC8140
may be usefully supplemented by other renowned figures of
Internet and more general lore. This document extends the
menagerie to the seminal fable of the
"Holy Hand Grenade of Antioch", as depicted in the
Monty Python film "Monty Python and the Holy Grail",
-as well as "Spamalot", the musical inspired by the movie.
+as well as "Spamalot", the musical inspired by the movie.
+
Informational
@@ -417,7 +417,8 @@ as well as "Spamalot", the musical inspired by the movie.
en
- Avian carriers can provide high delay, low throughput, and low
+
+ Avian carriers can provide high delay, low throughput, and low
altitude service. The connection topology is limited to a single
point-to-point path for each carrier, used with standard carriers,
but many carriers can be used without significant interference with
@@ -427,7 +428,8 @@ IEEE802.3. The carriers have an intrinsic collision avoidance
system, which increases availability. Unlike some network
technologies, such as packet radio, communication is not limited to
line-of-sight distance. Connection oriented service is available in
-some cities, usually based upon a central hub topology.
+some cities, usually based upon a central hub topology.
+
Published
diff --git a/spec/relaton/cli/xml_to_html_renderer_spec.rb b/spec/relaton/cli/xml_to_html_renderer_spec.rb
index 500a6d4..4ce740b 100644
--- a/spec/relaton/cli/xml_to_html_renderer_spec.rb
+++ b/spec/relaton/cli/xml_to_html_renderer_spec.rb
@@ -31,6 +31,29 @@
end
end
+ context "with markup and entities in the collection title and author" do
+ let(:html) do
+ renderer.render(File.read("spec/assets/index-with-markup.xml"))
+ end
+
+ it "preserves markup and & in the coverpage title" do
+ expect(html).to include(
+ 'Use of ActualText ' \
+ "& Reference structure elements",
+ )
+ end
+
+ it "strips inline tags but keeps & in " do
+ head_title = html[/([^<]*(?:<(?!\/title)[^<]*)*)<\/title>/m, 1]
+ expect(head_title).to include("&")
+ expect(head_title).not_to include("")
+ end
+
+ it "preserves & in the rendered author" do
+ expect(html).to include("Acme & Co")
+ end
+ end
+
context "with a document containing other collections" do
let(:html) do
renderer.render(File.read("spec/assets/with-collections.xml"))
diff --git a/templates/_index.liquid b/templates/_index.liquid
index 4b2c97b..9d36d84 100644
--- a/templates/_index.liquid
+++ b/templates/_index.liquid
@@ -1,7 +1,7 @@
- {{ title }}
+ {{ title | strip_html }}