From 82268903fe3fbfba856bcdb92d73b5f0e3cdd6b6 Mon Sep 17 00:00:00 2001
From: John McLear <john@mclear.co.uk>
Date: Sun, 29 Mar 2020 12:06:31 +0000
Subject: [PATCH 1/4] ImportHandler: quick & dirty way of being more lax when
 matching <title>

This change is meant to make it easier to use LibreOffice as a converter. When
LibreOffice converts a file, it adds some classes to the <title> tag.
This is a quick & dirty way of matching the <title> tag and commenting it out
regardless of the classes that are set on it.
---
 src/node/handler/ImportHandler.js | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/node/handler/ImportHandler.js b/src/node/handler/ImportHandler.js
index 17b306595c1..3c85b4cc513 100644
--- a/src/node/handler/ImportHandler.js
+++ b/src/node/handler/ImportHandler.js
@@ -181,8 +181,15 @@ async function doImport(req, res, padId)
   if (!req.directDatabaseAccess) {
     text = await fsp_readFile(destFile, "utf8");
 
-    // Title needs to be stripped out else it appends it to the pad..
-    text = text.replace("<title>", "<!-- <title>");
+    /*
+     * The <title> tag needs to be stripped out, otherwise it is appended to the
+     * pad.
+     *
+     * Moreover, when using LibreOffice to convert the file, some classes are
+     * added to the <title> tag. This is a quick & dirty way of matching the
+     * title and comment it out independently on the classes that are set on it.
+     */
+    text = text.replace("<title", "<!-- <title");
     text = text.replace("</title>","</title>-->");
 
     // node on windows has a delay on releasing of the file lock.

From 1aabb0680a0aef69141c2b32ae1e5fa0322a7f1c Mon Sep 17 00:00:00 2001
From: John McLear <john@mclear.co.uk>
Date: Sun, 29 Mar 2020 12:09:33 +0000
Subject: [PATCH 2/4] contentcollector: remove weird stuff LibreOffice adds to
 DOM before importing

---
 src/static/js/contentcollector.js | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/static/js/contentcollector.js b/src/static/js/contentcollector.js
index d3bd7338340..038b235070a 100644
--- a/src/static/js/contentcollector.js
+++ b/src/static/js/contentcollector.js
@@ -526,6 +526,16 @@ function makeContentCollector(collectStyles, abrowser, apool, domInterface, clas
         if (isPre) cc.incrementFlag(state, 'preMode');
         var oldListTypeOrNull = null;
         var oldAuthorOrNull = null;
+
+        // LibreOffice Writer puts in weird items during import or copy/paste, we should drop them.
+        if (cls === "Numbering_20_Symbols" || cls === "Bullet_20_Symbols") {
+          styl = null;
+          cls = null;
+
+          // We have to return here but this could break things in the future, for now it shows how to fix the problem
+          return;
+        }
+
         if (collectStyles)
         {
           hooks.callAll('collectContentPre', {

From 0da1573122681f837dadf1aa735193668e2115c8 Mon Sep 17 00:00:00 2001
From: John McLear <john@mclear.co.uk>
Date: Tue, 7 Apr 2020 10:44:54 +0000
Subject: [PATCH 3/4] LibreOffice: decouple the extension of the temporary file
 from its type

In the next commit, we are going to change the conversion method to
"html:XHTML Writer File:UTF8". Without this change, that conversion method name
would end up in the extension of the temporary file that is created as an
intermediate step. In this way, the file extension will always stay ".html".

No functional changes, hopefully. Only the extension of the temporary file
should change.
---
 src/node/utils/LibreOffice.js | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/node/utils/LibreOffice.js b/src/node/utils/LibreOffice.js
index d72b9324920..e2c6430b09f 100644
--- a/src/node/utils/LibreOffice.js
+++ b/src/node/utils/LibreOffice.js
@@ -38,6 +38,9 @@ var libreOfficeLogger = log4js.getLogger('LibreOffice');
  * @param  {Function}   callback    Standard callback function
  */
 exports.convertFile = function(srcFile, destFile, type, callback) {
+  // Used for the moving of the file, not the conversion
+  var fileExtension = type;
+
   // soffice can't convert from html to doc directly (verified with LO 5 and 6)
   // we need to convert to odt first, then to doc
   // to avoid `Error: no export filter for /tmp/xxxx.doc` error
@@ -47,11 +50,11 @@ exports.convertFile = function(srcFile, destFile, type, callback) {
       "destFile": destFile.replace(/\.doc$/, '.odt'),
       "type": 'odt',
       "callback": function () {
-        queue.push({"srcFile": srcFile.replace(/\.html$/, '.odt'), "destFile": destFile, "type": type, "callback": callback});
+        queue.push({"srcFile": srcFile.replace(/\.html$/, '.odt'), "destFile": destFile, "type": type, "callback": callback, "fileExtension": fileExtension });
       }
     });
   } else {
-    queue.push({"srcFile": srcFile, "destFile": destFile, "type": type, "callback": callback});
+    queue.push({"srcFile": srcFile, "destFile": destFile, "type": type, "callback": callback, "fileExtension": fileExtension});
   }
 };
 
@@ -102,7 +105,7 @@ function doConvertTask(task, callback) {
     // Move the converted file to the correct place
     function(callback) {
       var filename = path.basename(task.srcFile);
-      var sourceFilename = filename.substr(0, filename.lastIndexOf('.')) + '.' + task.type;
+      var sourceFilename = filename.substr(0, filename.lastIndexOf('.')) + '.' + task.fileExtension;
       var sourcePath = path.join(tmpDir, sourceFilename);
       libreOfficeLogger.debug(`Renaming ${sourcePath} to ${task.destFile}`);
       fs.rename(sourcePath, task.destFile, callback);

From 2b83227610c512732211fca38d0ce996ec1a72e1 Mon Sep 17 00:00:00 2001
From: John McLear <john@mclear.co.uk>
Date: Sun, 29 Mar 2020 12:09:08 +0000
Subject: [PATCH 4/4] LibreOffice: use "html:XHTML Writer File:UTF8" export
 method

This yields better conversion results, but requires the previous change:
otherwise the export filter name would end up in the extension used to locate
the temporary file, and the converted file would not be found.
---
 src/node/utils/LibreOffice.js | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/node/utils/LibreOffice.js b/src/node/utils/LibreOffice.js
index e2c6430b09f..267f7a0cd0a 100644
--- a/src/node/utils/LibreOffice.js
+++ b/src/node/utils/LibreOffice.js
@@ -41,6 +41,11 @@ exports.convertFile = function(srcFile, destFile, type, callback) {
   // Used for the moving of the file, not the conversion
   var fileExtension = type;
 
+  if (type === "html") {
+    // "html:XHTML Writer File:UTF8" does a better job than normal html exports
+    type = "html:XHTML Writer File:UTF8";
+  }
+
   // soffice can't convert from html to doc directly (verified with LO 5 and 6)
   // we need to convert to odt first, then to doc
   // to avoid `Error: no export filter for /tmp/xxxx.doc` error