From 82268903fe3fbfba856bcdb92d73b5f0e3cdd6b6 Mon Sep 17 00:00:00 2001 From: John McLear Date: Sun, 29 Mar 2020 12:06:31 +0000 Subject: [PATCH 1/4] ImportHandler: quick & dirty way of being more lax when matching This change is meant to ease using LibreOffice as converter. When LibreOffice converts a file, it adds some classes to the <title> tag. This is a quick & dirty way of matching the <title> tag and commenting it out regardless of the classes that are set on it. --- src/node/handler/ImportHandler.js | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/node/handler/ImportHandler.js b/src/node/handler/ImportHandler.js index 17b306595c1..3c85b4cc513 100644 --- a/src/node/handler/ImportHandler.js +++ b/src/node/handler/ImportHandler.js @@ -181,8 +181,15 @@ async function doImport(req, res, padId) if (!req.directDatabaseAccess) { text = await fsp_readFile(destFile, "utf8"); - // Title needs to be stripped out else it appends it to the pad.. - text = text.replace("<title>", "<!-- <title>"); + /* + * The <title> tag needs to be stripped out, otherwise it is appended to the + * pad. + * + * Moreover, when using LibreOffice to convert the file, some classes are + * added to the <title> tag. This is a quick & dirty way of matching the + * title and comment it out independently on the classes that are set on it. + */ + text = text.replace("<title", "<!-- <title"); text = text.replace("</title>","-->"); // node on windows has a delay on releasing of the file lock. 
From 1aabb0680a0aef69141c2b32ae1e5fa0322a7f1c Mon Sep 17 00:00:00 2001 From: John McLear Date: Sun, 29 Mar 2020 12:09:33 +0000 Subject: [PATCH 2/4] contentcollector: remove weird stuff LibreOffice adds to DOM before importing --- src/static/js/contentcollector.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/static/js/contentcollector.js b/src/static/js/contentcollector.js index d3bd7338340..038b235070a 100644 --- a/src/static/js/contentcollector.js +++ b/src/static/js/contentcollector.js @@ -526,6 +526,16 @@ function makeContentCollector(collectStyles, abrowser, apool, domInterface, clas if (isPre) cc.incrementFlag(state, 'preMode'); var oldListTypeOrNull = null; var oldAuthorOrNull = null; + + // LibreOffice Writer puts in weird items during import or copy/paste, we should drop them. + if (cls === "Numbering_20_Symbols" || cls === "Bullet_20_Symbols") { + styl = null; + cls = null; + + // We have to return here but this could break things in the future, for now it shows how to fix the problem + return; + } + if (collectStyles) { hooks.callAll('collectContentPre', { From 0da1573122681f837dadf1aa735193668e2115c8 Mon Sep 17 00:00:00 2001 From: John McLear Date: Tue, 7 Apr 2020 10:44:54 +0000 Subject: [PATCH 3/4] LibreOffice: decouple the extension of the temporary file from its type In the next commit, we are going to change the conversion method to "html:XHTML Writer File:UTF8". Without this change, that conversion method name would end up in the extension of the temporary file that is created as an intermediate step. In this way, the file extension will always stay ".html". No functional changes, hopefully. Only the extension of the temporary file should change. 
--- src/node/utils/LibreOffice.js | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/node/utils/LibreOffice.js b/src/node/utils/LibreOffice.js index d72b9324920..e2c6430b09f 100644 --- a/src/node/utils/LibreOffice.js +++ b/src/node/utils/LibreOffice.js @@ -38,6 +38,9 @@ var libreOfficeLogger = log4js.getLogger('LibreOffice'); * @param {Function} callback Standard callback function */ exports.convertFile = function(srcFile, destFile, type, callback) { + // Used for the moving of the file, not the conversion + var fileExtension = type; + // soffice can't convert from html to doc directly (verified with LO 5 and 6) // we need to convert to odt first, then to doc // to avoid `Error: no export filter for /tmp/xxxx.doc` error @@ -47,11 +50,11 @@ exports.convertFile = function(srcFile, destFile, type, callback) { "destFile": destFile.replace(/\.doc$/, '.odt'), "type": 'odt', "callback": function () { - queue.push({"srcFile": srcFile.replace(/\.html$/, '.odt'), "destFile": destFile, "type": type, "callback": callback}); + queue.push({"srcFile": srcFile.replace(/\.html$/, '.odt'), "destFile": destFile, "type": type, "callback": callback, "fileExtension": fileExtension }); } }); } else { - queue.push({"srcFile": srcFile, "destFile": destFile, "type": type, "callback": callback}); + queue.push({"srcFile": srcFile, "destFile": destFile, "type": type, "callback": callback, "fileExtension": fileExtension}); } }; @@ -102,7 +105,7 @@ function doConvertTask(task, callback) { // Move the converted file to the correct place function(callback) { var filename = path.basename(task.srcFile); - var sourceFilename = filename.substr(0, filename.lastIndexOf('.')) + '.' + task.type; + var sourceFilename = filename.substr(0, filename.lastIndexOf('.')) + '.' 
+ task.fileExtension; var sourcePath = path.join(tmpDir, sourceFilename); libreOfficeLogger.debug(`Renaming ${sourcePath} to ${task.destFile}`); fs.rename(sourcePath, task.destFile, callback); From 2b83227610c512732211fca38d0ce996ec1a72e1 Mon Sep 17 00:00:00 2001 From: John McLear Date: Sun, 29 Mar 2020 12:09:08 +0000 Subject: [PATCH 4/4] LibreOffice: use "html:XHTML Writer File:UTF8" export method This yields better conversion results, but requires the previous change, otherwise there would have been difficulties in locating the temporary file name. --- src/node/utils/LibreOffice.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/node/utils/LibreOffice.js b/src/node/utils/LibreOffice.js index e2c6430b09f..267f7a0cd0a 100644 --- a/src/node/utils/LibreOffice.js +++ b/src/node/utils/LibreOffice.js @@ -41,6 +41,11 @@ exports.convertFile = function(srcFile, destFile, type, callback) { // Used for the moving of the file, not the conversion var fileExtension = type; + if (type === "html") { + // "html:XHTML Writer File:UTF8" does a better job than normal html exports + type = "html:XHTML Writer File:UTF8"; + } + // soffice can't convert from html to doc directly (verified with LO 5 and 6) // we need to convert to odt first, then to doc // to avoid `Error: no export filter for /tmp/xxxx.doc` error