summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn McLear <john@mclear.co.uk>2015-05-18 20:04:15 +0100
committerJohn McLear <john@mclear.co.uk>2015-05-18 20:04:15 +0100
commit5615bab0d9e7cd62867991335dbf176055cc2332 (patch)
tree7364aef30eae2228e19ca2cd9d29f5e5d70d7aec
parent0c4b3f8124147bccd39105b06caf324419783b42 (diff)
parentfd9d0bc291cbfca042f818d76541464c170dc130 (diff)
downloadetherpad-lite-5615bab0d9e7cd62867991335dbf176055cc2332.zip
Merge pull request #2668 from simong/tidy
Tidy HTML before trying to convert it with abiword
-rw-r--r--settings.json.template42
-rw-r--r--src/node/handler/ExportHandler.js36
-rw-r--r--src/node/utils/Settings.js9
-rw-r--r--src/node/utils/TidyHtml.js41
-rw-r--r--tests/backend/specs/api/tidy.js63
5 files changed, 156 insertions, 35 deletions
diff --git a/settings.json.template b/settings.json.template
index 7d9c62cc..310e0791 100644
--- a/settings.json.template
+++ b/settings.json.template
@@ -10,12 +10,12 @@
// favicon default name
// alternatively, set up a fully specified Url to your own favicon
"favicon": "favicon.ico",
-
+
//IP and port which etherpad should bind at
"ip": "0.0.0.0",
"port" : 9001,
- /*
+ /*
// Node native SSL support
// this is disabled by default
//
@@ -37,17 +37,17 @@
"dbSettings" : {
"filename" : "var/dirty.db"
},
-
+
/* An Example of MySQL Configuration
"dbType" : "mysql",
"dbSettings" : {
- "user" : "root",
- "host" : "localhost",
- "password": "",
+ "user" : "root",
+ "host" : "localhost",
+ "password": "",
"database": "store"
},
*/
-
+
//the default text of a pad
"defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n",
@@ -65,7 +65,7 @@
"chatAndUsers": false,
"lang": "en-gb"
},
-
+
/* Shoud we suppress errors from being visible in the default Pad Text? */
"suppressErrorsInPadText" : false,
@@ -77,35 +77,39 @@
/* Users, who have a valid session, automatically get granted access to password protected pads */
"sessionNoPassword" : false,
-
- /* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly,
+
+ /* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly,
but makes it impossible to debug the javascript/css */
"minify" : true,
/* How long may clients use served javascript code (in seconds)? Without versioning this
may cause problems during deployment. Set to 0 to disable caching */
"maxAge" : 21600, // 60 * 60 * 6 = 6 hours
-
+
/* This is the path to the Abiword executable. Setting it to null, disables abiword.
- Abiword is needed to advanced import/export features of pads*/
+ Abiword is needed to advanced import/export features of pads*/
"abiword" : null,
+ /* This is the path to the Tidy executable. Setting it to null disables Tidy.
+ Tidy is used to improve the quality of exported pads. */
+ "tidyHtml" : null,
+
/* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */
"allowUnknownFileEnds" : true,
-
+
/* This setting is used if you require authentication of all users.
Note: /admin always requires authentication. */
"requireAuthentication" : false,
/* Require authorization by a module, or a user with is_admin set, see below. */
"requireAuthorization" : false,
-
+
/*when you use NginX or another proxy/ load-balancer set this to true*/
"trustProxy" : false,
-
+
/* Privacy: disable IP logging */
- "disableIPlogging" : false,
-
+ "disableIPlogging" : false,
+
/* Users for basic authentication. is_admin = true gives access to /admin.
If you do not uncomment this, /admin will not be available! */
/*
@@ -126,7 +130,7 @@
// Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance.
"loadTest": false,
-
+
/* The toolbar buttons configuration.
"toolbar": {
"left": [
@@ -148,7 +152,7 @@
/* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */
"loglevel": "INFO",
-
+
//Logging configuration. See log4js documentation for further information
// https://github.com/nomiddlename/log4js-node
// You can add as many appenders as you want here:
diff --git a/src/node/handler/ExportHandler.js b/src/node/handler/ExportHandler.js
index f20e8715..f861c82e 100644
--- a/src/node/handler/ExportHandler.js
+++ b/src/node/handler/ExportHandler.js
@@ -28,6 +28,7 @@ var fs = require("fs");
var settings = require('../utils/Settings');
var os = require('os');
var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks");
+var TidyHtml = require('../utils/TidyHtml');
//load abiword only if its enabled
if(settings.abiword != null)
@@ -35,28 +36,28 @@ if(settings.abiword != null)
var tempDirectory = "/tmp";
-//tempDirectory changes if the operating system is windows
+//tempDirectory changes if the operating system is windows
if(os.type().indexOf("Windows") > -1)
{
tempDirectory = process.env.TEMP;
}
-
+
/**
* do a requested export
- */
+ */
exports.doExport = function(req, res, padId, type)
{
var fileName = padId;
// allow fileName to be overwritten by a hook, the type type is kept static for security reasons
- hooks.aCallFirst("exportFileName", padId,
+ hooks.aCallFirst("exportFileName", padId,
function(err, hookFileName){
// if fileName is set then set it to the padId, note that fileName is returned as an array.
if(hookFileName.length) fileName = hookFileName;
//tell the browser that this is a downloadable file
res.attachment(fileName + "." + type);
-
+
//if this is a plain text export, we can do this directly
// We have to over engineer this because tabs are stored as attributes and not plain text
if(type == "etherpad"){
@@ -72,7 +73,7 @@ exports.doExport = function(req, res, padId, type)
var txt;
var randNum;
var srcFile, destFile;
-
+
async.series([
//render the txt document
function(callback)
@@ -96,7 +97,7 @@ exports.doExport = function(req, res, padId, type)
{
//ensure html can be collected by the garbage collector
txt = null;
-
+
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback);
},
@@ -140,7 +141,7 @@ exports.doExport = function(req, res, padId, type)
var html;
var randNum;
var srcFile, destFile;
-
+
async.series([
//render the html document
function(callback)
@@ -150,7 +151,7 @@ exports.doExport = function(req, res, padId, type)
if(ERR(err, callback)) return;
html = _html;
callback();
- });
+ });
},
//decide what to do with the html export
function(callback)
@@ -162,22 +163,29 @@ exports.doExport = function(req, res, padId, type)
hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){
if(newHTML.length) html = newHTML;
res.send(html);
- callback("stop");
+ callback("stop");
});
}
else //write the html export to a file
{
randNum = Math.floor(Math.random()*0xFFFFFFFF);
srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html";
- fs.writeFile(srcFile, html, callback);
+ fs.writeFile(srcFile, html, callback);
}
},
- //send the convert job to abiword
+
+ // Tidy up the exported HTML
function(callback)
{
//ensure html can be collected by the garbage collector
html = null;
-
+
+ TidyHtml.tidy(srcFile, callback);
+ },
+
+ //send the convert job to abiword
+ function(callback)
+ {
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback);
},
@@ -199,7 +207,7 @@ exports.doExport = function(req, res, padId, type)
//100ms delay to accomidate for slow windows fs
if(os.type().indexOf("Windows") > -1)
{
- setTimeout(function()
+ setTimeout(function()
{
fs.unlink(destFile, callback);
}, 100);
diff --git a/src/node/utils/Settings.js b/src/node/utils/Settings.js
index b7d1f0bc..2cc6a926 100644
--- a/src/node/utils/Settings.js
+++ b/src/node/utils/Settings.js
@@ -153,6 +153,11 @@ exports.minify = true;
exports.abiword = null;
/**
+ * The path of the Tidy executable; null disables Tidy
+ */
+exports.tidyHtml = null;
+
+/**
* Should we support none natively supported file types on import?
*/
exports.allowUnknownFileEnds = true;
@@ -167,7 +172,7 @@ exports.loglevel = "INFO";
*/
exports.disableIPlogging = false;
-/**
+/**
* Disable Load Testing
*/
exports.loadTest = false;
@@ -239,7 +244,7 @@ exports.reloadSettings = function reloadSettings() {
} else {
settingsFilename = path.resolve(path.join(exports.root, settingsFilename));
}
-
+
var settingsStr;
try{
//read the settings sync
diff --git a/src/node/utils/TidyHtml.js b/src/node/utils/TidyHtml.js
new file mode 100644
index 00000000..5d4e6ed7
--- /dev/null
+++ b/src/node/utils/TidyHtml.js
@@ -0,0 +1,41 @@
+/**
+ * Tidy up the HTML in a given file
+ */
+
+var log4js = require('log4js');
+var settings = require('./Settings');
+var spawn = require('child_process').spawn;
+
+/**
+ * Runs the configured Tidy executable on an HTML file, rewriting it in place.
+ *
+ * @param {String} srcFile path of the HTML file handed to Tidy
+ * @param {Function} callback invoked exactly once: with null when Tidy is
+ *   disabled or exits with code 0 or 1, otherwise with an error message string
+ */
+exports.tidy = function(srcFile, callback) {
+  var logger = log4js.getLogger('TidyHtml');
+
+  // Don't do anything if Tidy hasn't been enabled
+  if (!settings.tidyHtml) {
+    logger.debug('tidyHtml has not been configured yet, ignoring tidy request');
+    return callback(null);
+  }
+
+  var errMessage = '';
+  var finished = false;
+
+  // A failed spawn can emit both 'error' and 'close', so make sure the
+  // callback is only ever invoked for the first of them
+  var finish = function(err) {
+    if (finished) {
+      return;
+    }
+    finished = true;
+    callback(err);
+  };
+
+  // Spawn a new tidy instance that cleans up the file inline
+  logger.debug('Tidying ' + srcFile);
+  var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]);
+
+  // Without an 'error' listener a missing or non-executable Tidy binary
+  // would surface as an uncaught exception instead of a failed export
+  tidy.on('error', function(err) {
+    logger.error('Failed to start ' + settings.tidyHtml + ': ' + err.message);
+    finish('Tidy could not be started: ' + err.message);
+  });
+
+  // Keep track of any error messages
+  tidy.stderr.on('data', function (data) {
+    errMessage += data.toString();
+  });
+
+  // Wait until Tidy is done
+  tidy.on('close', function(code) {
+    // Tidy returns a 0 when no errors occur and a 1 exit code when
+    // the file could be tidied but a few warnings were generated
+    if (code === 0 || code === 1) {
+      logger.debug('Tidied ' + srcFile + ' successfully');
+      return finish(null);
+    }
+
+    logger.error('Failed to tidy ' + srcFile + '\n' + errMessage);
+    return finish('Tidy died with exit code ' + code);
+  });
+};
diff --git a/tests/backend/specs/api/tidy.js b/tests/backend/specs/api/tidy.js
new file mode 100644
index 00000000..47cb49f6
--- /dev/null
+++ b/tests/backend/specs/api/tidy.js
@@ -0,0 +1,63 @@
+// TidyHtml and Settings stay null until before() has loaded npm and required them
+var assert = require('assert'),
+    fs = require('fs'),
+    path = require('path'),
+    TidyHtml = null,
+    Settings = null;
+
+var npm = require("../../../../src/node_modules/npm/lib/npm.js");
+
+describe('tidyHtml', function() {
+  // The modules under test are only required once npm.load() has completed
+  before(function(done) {
+    npm.load({}, function(err) {
+      assert.ok(!err);
+      TidyHtml = require('../../../../src/node/utils/TidyHtml');
+      Settings = require('../../../../src/node/utils/Settings');
+      return done();
+    });
+  });
+
+  it('Tidies HTML', function(done) {
+    // Tidy is required for this test, so skip it if the user hasn't configured Tidy
+    if (!Settings.tidyHtml) {
+      this.skip();
+    }
+
+    // Try to tidy up a bad HTML file
+    var tmpDir = process.env.TEMP || "/tmp";
+    var tmpFile = path.join(tmpDir, 'tmp_' + (Math.floor(Math.random() * 1000000)) + '.html');
+    fs.writeFileSync(tmpFile, '<html><body><p>a paragraph</p><li>List without outer UL</li>trailing closing p</p></body></html>');
+    TidyHtml.tidy(tmpFile, function(err){
+      var cleanedHtml;
+      try {
+        assert.ok(!err);
+
+        // Read the file again
+        cleanedHtml = fs.readFileSync(tmpFile).toString();
+      } finally {
+        // Don't leave the scratch file behind, whether or not tidying worked
+        fs.unlinkSync(tmpFile);
+      }
+
+      // Only the tail of the document is compared; whatever Tidy emits before
+      // the <title> element is not checked
+      var expectedHtml = [
+        '<title></title>',
+        '</head>',
+        '<body>',
+        '<p>a paragraph</p>',
+        '<ul>',
+        '<li>List without outer UL</li>',
+        '<li style="list-style: none">trailing closing p</li>',
+        '</ul>',
+        '</body>',
+        '</html>',
+      ].join('\n');
+      assert.notStrictEqual(cleanedHtml.indexOf(expectedHtml), -1);
+      return done();
+    });
+  });
+
+  it('can deal with errors', function(done) {
+    // Tidy is required for this test, so skip it if the user hasn't configured Tidy
+    if (!Settings.tidyHtml) {
+      this.skip();
+    }
+
+    // Tidying a path that does not exist must report an error to the callback
+    TidyHtml.tidy('/some/none/existing/file.html', function(err) {
+      assert.ok(err);
+      return done();
+    });
+  });
+});