From 55549a1746cb0c0eb2aa79c9d60c71481d4c5c55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= <lukas.macuda@gmail.com>
Date: Fri, 7 Dec 2018 09:49:38 +0100
Subject: [PATCH 1/2] Allow registering stopwords files outside of the lib
 directory

---
 Readme.md    |  2 +-
 lib/lda.js   | 17 +++++++++++-
 package.json |  2 +-
 test4.js     | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 3 deletions(-)
 create mode 100644 test4.js

diff --git a/Readme.md b/Readme.md
index 6d8e727..9c4538b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']);
 result = lda(documents, 2, 5, ['en', 'de']);
 ```
 
-To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows:
+To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows:
 
 ```javascript
 exports.stop_words = [
diff --git a/lib/lda.js b/lib/lda.js
index 081e854..4d795ab 100644
--- a/lib/lda.js
+++ b/lib/lda.js
@@ -1,5 +1,11 @@
 var stem = require('stem-porter');
 
+var STOP_WORDS_MAP = {
+    en: './stopwords_en.js',
+    de: './stopwords_de.js',
+    es: './stopwords_es.js',
+};
+
 //
 // Based on javascript implementation https://github.com/awaisathar/lda.js
 // Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java
@@ -22,7 +28,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
       var stopwords = new Array();
 
       languages.forEach(function(value) {
-          var stopwordsLang = require('./stopwords_' + value + ".js");
+          var path = STOP_WORDS_MAP[value];
+          if (!path) {
+              return;
+          }
+          var stopwordsLang = require(path);
           stopwords = stopwords.concat(stopwordsLang.stop_words);
       });
 
@@ -99,6 +109,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
     return result;
 }
 
+process.registerStopwords = function(language, path) {
+    STOP_WORDS_MAP[language] = path;
+    return this;
+};
+
 function makeArray(x) {
     var a = new Array();    
     for (var i=0;i<x;i++)  {
diff --git a/package.json b/package.json
index 4218389..c97b6a1 100644
--- a/package.json
+++ b/package.json
@@ -13,7 +13,7 @@
   },
   "main": "./lib",
   "dependencies": {
-  "stem-porter": "*"
+    "stem-porter": "*"
   },
   "engines": {
     "node": ">= 0.8.x"
diff --git a/test4.js b/test4.js
new file mode 100644
index 0000000..150fc59
--- /dev/null
+++ b/test4.js
@@ -0,0 +1,77 @@
+const lda = require('./lib/lda');
+const path = require('path');
+
+lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js'));
+
+const collection = [
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    '',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    null,
+    'Slippers are soft on your feet.'
+  ],
+  [
+    '',
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    null,
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.',
+    ''
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.',
+    null
+  ]
+];
+
+var probabilities = [];
+
+collection.forEach((documents, i) => {
+  const results = lda(documents, 3, 2, ['en_override'], null, null, 123);
+
+  // Save the probabilities for each group. The values should be the same, since we're using the same random seed.
+  const groupProbs = [];
+  results.forEach(group => {
+    group.forEach(row => {
+      groupProbs.push(row.probability);
+    });
+  });
+
+  // Store the entire group in an array.
+  probabilities.push(groupProbs);
+
+  //console.log('\nSet ' + (i + 1));
+  //console.log(results);
+});
+
+var success = true;
+
+// Verify the probabilities for each group are the same, even with empty and null values in the docs.
+probabilities.forEach((group, i) => {
+  if (group[0] !== 0.15 || group[1] !== 0.14 || group[2] !== 0.16 || group[3] !== 0.15 || group[4] !== 0.16 || group[5] !== 0.14) {
+    console.log('Failed expected values for group ' + i);
+    success = false;
+  }
+});
+
+if (success) {
+  console.log('\nResult OK.');
+}
\ No newline at end of file

From b85cd49536bafdde23e67c3f968b37903ea11894 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= <lukas.macuda@gmail.com>
Date: Fri, 7 Dec 2018 09:53:56 +0100
Subject: [PATCH 2/2] update of the readme

---
 Readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Readme.md b/Readme.md
index 9c4538b..1a3f214 100644
--- a/Readme.md
+++ b/Readme.md
@@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']);
 result = lda(documents, 2, 5, ['en', 'de']);
 ```
 
-To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows:
+To add a new language-specific stop-words list, register a file for the specific language. For example, to add a French stop-words list, call `lda.registerStopwords('fr', '/path/to/the/french/stopwords.js')`. Use an absolute path, because the file is loaded with `require` from inside the library's `lib` directory. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is as follows:
 
 ```javascript
 exports.stop_words = [