From c7e340a78ac8c530b17a3b7f79c7872b850cfdfb Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Tue, 15 Aug 2023 17:58:56 +0900 Subject: [PATCH] Support non-breaking content in java This patch supports non-breaking content in Java. In the Java and Python implementations, the "skip" operation includes the skipped content in the text passed to the BudouX parser, so no changes to the parser's input text are needed. This patch makes the following changes: 1. Add `NOBR` to the list of "skip" elements. 2. Fix "skip" so that it applies only to the element's descendants. Before this patch, all content following a "skip" element was skipped. 3. When there is a phrase boundary right before a "skip" element, insert a break before the "skip" element. --- budoux/skip_nodes.json | 1 + .../java/com/google/budoux/HTMLProcessor.java | 14 ++++++++++++-- .../com/google/budoux/HTMLProcessorTest.java | 17 ++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/budoux/skip_nodes.json b/budoux/skip_nodes.json index e42224f0..782f51d9 100644 --- a/budoux/skip_nodes.json +++ b/budoux/skip_nodes.json @@ -5,6 +5,7 @@ "IFRAME", "INPUT", "META", + "NOBR", "SCRIPT", "STYLE", "TEXTAREA", diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index 6818e263..9a8fca27 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Locale; import java.util.Set; +import java.util.Stack; import java.util.stream.Collectors; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -61,6 +62,7 @@ private static class PhraseResolvingNodeVisitor implements NodeVisitor { private final StringBuilder output = new StringBuilder(); private Integer scanIndex = 0; private boolean toSkip = false; + private Stack elementStack = new Stack(); PhraseResolvingNodeVisitor(List phrases) { this.phrasesJoined = String.join(Character.toString(SEP), phrases); 
@@ -76,14 +78,20 @@ public void head(Node node, int depth) { return; } if (node instanceof Element) { + elementStack.push(toSkip); String attributesEncoded = node.attributes().asList().stream() .map(attribute -> " " + attribute) .collect(Collectors.joining("")); - output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded)); - if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) { + final String nodeName = node.nodeName(); + if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { + if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) { + output.append(""); + scanIndex++; + } toSkip = true; } + output.append(String.format("<%s%s>", nodeName, attributesEncoded)); } else if (node instanceof TextNode) { String data = ((TextNode) node).getWholeText(); for (int i = 0; i < data.length(); i++) { @@ -105,6 +113,8 @@ public void tail(Node node, int depth) { if (node.nodeName().equals("body") || node instanceof TextNode) { return; } + assert node instanceof Element; + toSkip = elementStack.pop(); output.append(String.format("", node.nodeName())); } } diff --git a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java index 202e630d..65f55055 100644 --- a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java +++ b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java @@ -51,12 +51,23 @@ public void testResolveWithStandardHTMLInput() { @Test public void testResolveWithNodesToSkip() { - List phrases = Arrays.asList("abc", "def"); - String html = "af"; + List phrases = Arrays.asList("abc", "def", "ghi"); + String html = "afghi"; + String result = HTMLProcessor.resolve(phrases, html); + assertEquals( + "afghi", + result); + } + + @Test + public void testResolveWithNodesBreakBeforeSkip() { + List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); + String html = "abcdefghijkl"; String result = HTMLProcessor.resolve(phrases, html); assertEquals( "af", + + " 
anywhere;\">abcdefghijkl", result); }