From 55549a1746cb0c0eb2aa79c9d60c71481d4c5c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= Date: Fri, 7 Dec 2018 09:49:38 +0100 Subject: [PATCH 1/2] registrations of the stopwords files outside of the lib directory --- Readme.md | 2 +- lib/lda.js | 17 +++++++++++- package.json | 2 +- test4.js | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 test4.js diff --git a/Readme.md b/Readme.md index 6d8e727..9c4538b 100644 --- a/Readme.md +++ b/Readme.md @@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. 
The format is, as follows: ```javascript exports.stop_words = [ diff --git a/lib/lda.js b/lib/lda.js index 081e854..4d795ab 100644 --- a/lib/lda.js +++ b/lib/lda.js @@ -1,5 +1,11 @@ var stem = require('stem-porter'); +var STOP_WORDS_MAP = { + en: './stopwords_en.js', + de: './stopwords_de.js', + es: './stopwords_es.js', +}; + // // Based on javascript implementation https://github.com/awaisathar/lda.js // Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java @@ -22,7 +28,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var stopwords = new Array(); languages.forEach(function(value) { - var stopwordsLang = require('./stopwords_' + value + ".js"); + var path = STOP_WORDS_MAP[value]; + if (!path) { + return; + } + var stopwordsLang = require(path); stopwords = stopwords.concat(stopwordsLang.stop_words); }); @@ -99,6 +109,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag return result; } +process.registerStopwords = function(language, path) { + STOP_WORDS_MAP[language] = path; + return this; +}; + function makeArray(x) { var a = new Array(); for (var i=0;i= 0.8.x" diff --git a/test4.js b/test4.js new file mode 100644 index 0000000..150fc59 --- /dev/null +++ b/test4.js @@ -0,0 +1,77 @@ +const lda = require('./lib/lda'); +const path = require('path'); + +lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js')); + +const collection = [ + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + '', + 'Slippers are soft on your feet.' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + null, + 'Slippers are soft on your feet.' + ], + [ + '', + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' + ], + [ + null, + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' 
+ ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + '' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + null + ] +]; + +var probabilities = []; + +collection.forEach((documents, i) => { + const results = lda(documents, 3, 2, ['en_override'], null, null, 123); + + // Save the probabilities for each group. The values should be the same, since we're using the same random seed. + const groupProbs = []; + results.forEach(group => { + group.forEach(row => { + groupProbs.push(row.probability); + }); + }); + + // Store the entire group in an array. + probabilities.push(groupProbs); + + //console.log('\nSet ' + (i + 1)); + //console.log(results); +}); + +var success = true; + +// Verify the probabilities for each group are the same, even with empty and null values in the docs. +probabilities.forEach((group, i) => { + if (group[0] !== 0.15 || group[1] !== 0.14 || group[2] !== 0.16 || group[3] !== 0.15 || group[4] !== 0.16 || group[5] !== 0.14) { + console.log('Failed expected values for group ' + i); + success = false; + } +}); + +if (success) { + console.log('\nResult OK.'); +} \ No newline at end of file From b85cd49536bafdde23e67c3f968b37903ea11894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= Date: Fri, 7 Dec 2018 09:53:56 +0100 Subject: [PATCH 2/2] update of the readme --- Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readme.md b/Readme.md index 9c4538b..1a3f214 100644 --- a/Readme.md +++ b/Readme.md @@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. 
The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file for the specific language. For example, to add a French stop-words list, call `lda.registerStopwords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is as follows: ```javascript exports.stop_words = [