From c7e340a78ac8c530b17a3b7f79c7872b850cfdfb Mon Sep 17 00:00:00 2001
From: Koji Ishii <kojii@chromium.org>
Date: Tue, 15 Aug 2023 17:58:56 +0900
Subject: [PATCH] Support non-breaking content in Java

This patch supports non-breaking content in Java.

In the Java and Python implementations, the "skip" operation includes
the skipped content in the text passed to the BudouX parser, so no
changes to the parser's input text are needed.

This patch makes the following changes:
1. Add `NOBR` to the "skip" element.
2. Fix "skip" so that it applies only to the element's descendants.
   Before this patch, all content following a "skip" element was skipped.
3. When there's a phrase boundary right before the "skip" element,
   insert a break before the "skip" element.
---
 budoux/skip_nodes.json                          |  1 +
 .../java/com/google/budoux/HTMLProcessor.java   | 14 ++++++++++++--
 .../com/google/budoux/HTMLProcessorTest.java    | 17 ++++++++++++++---
 3 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/budoux/skip_nodes.json b/budoux/skip_nodes.json
index e42224f0..782f51d9 100644
--- a/budoux/skip_nodes.json
+++ b/budoux/skip_nodes.json
@@ -5,6 +5,7 @@
   "IFRAME",
   "INPUT",
   "META",
+  "NOBR",
   "SCRIPT",
   "STYLE",
   "TEXTAREA",
diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java
index 6818e263..9a8fca27 100644
--- a/java/src/main/java/com/google/budoux/HTMLProcessor.java
+++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java
@@ -29,6 +29,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;
+import java.util.Stack;
 import java.util.stream.Collectors;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -61,6 +62,7 @@ private static class PhraseResolvingNodeVisitor implements NodeVisitor {
     private final StringBuilder output = new StringBuilder();
     private Integer scanIndex = 0;
     private boolean toSkip = false;
+    private Stack<Boolean> elementStack = new Stack<Boolean>();
 
     PhraseResolvingNodeVisitor(List<String> phrases) {
       this.phrasesJoined = String.join(Character.toString(SEP), phrases);
@@ -76,14 +78,20 @@ public void head(Node node, int depth) {
         return;
       }
       if (node instanceof Element) {
+        elementStack.push(toSkip);
         String attributesEncoded =
             node.attributes().asList().stream()
                 .map(attribute -> " " + attribute)
                 .collect(Collectors.joining(""));
-        output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded));
-        if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) {
+        final String nodeName = node.nodeName();
+        if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
+          if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) {
+            output.append("<wbr>");
+            scanIndex++;
+          }
           toSkip = true;
         }
+        output.append(String.format("<%s%s>", nodeName, attributesEncoded));
       } else if (node instanceof TextNode) {
         String data = ((TextNode) node).getWholeText();
         for (int i = 0; i < data.length(); i++) {
@@ -105,6 +113,8 @@ public void tail(Node node, int depth) {
       if (node.nodeName().equals("body") || node instanceof TextNode) {
         return;
       }
+      assert node instanceof Element;
+      toSkip = elementStack.pop();
       output.append(String.format("</%s>", node.nodeName()));
     }
   }
diff --git a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java
index 202e630d..65f55055 100644
--- a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java
+++ b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java
@@ -51,12 +51,23 @@ public void testResolveWithStandardHTMLInput() {
 
   @Test
   public void testResolveWithNodesToSkip() {
-    List<String> phrases = Arrays.asList("abc", "def");
-    String html = "a<button>bcde</button>f";
+    List<String> phrases = Arrays.asList("abc", "def", "ghi");
+    String html = "a<button>bcde</button>fghi";
+    String result = HTMLProcessor.resolve(phrases, html);
+    assertEquals(
+        "<span style=\"word-break: keep-all; overflow-wrap:"
+            + " anywhere;\">a<button>bcde</button>f<wbr>ghi</span>",
+        result);
+  }
+
+  @Test
+  public void testResolveWithNodesBreakBeforeSkip() {
+    List<String> phrases = Arrays.asList("abc", "def", "ghi", "jkl");
+    String html = "abc<nobr>defghi</nobr>jkl";
     String result = HTMLProcessor.resolve(phrases, html);
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap:"
-            + " anywhere;\">a<button>bcde</button>f</span>",
+            + " anywhere;\">abc<wbr><nobr>defghi</nobr><wbr>jkl</span>",
         result);
   }