diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c3629e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules diff --git a/README.md b/README.md index 09213ba..9921464 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ JavaScript port of [TextStatistics.php](https://github.com/DaveChild/Text-Statis I've done what I think is a reasonably faithful port. Documentation incoming! I removed a lot of the original comments during the port, but seeing as the API remained largely the same, I'll add them in shortly. -Same goes for a test suite - I'll get something working in node in a bit. :) +The beginning of a test suite in [Mocha](https://mochajs.org/) is here, covering cleaning the text and some cases of word and sentence counting. ## Installation diff --git a/index.js b/index.js index e2c1059..fd1fb2d 100644 --- a/index.js +++ b/index.js @@ -12,21 +12,23 @@ fullStopTags.forEach(function(tag) { text = text.replace("","."); - }) + }); text = text .replace(/<[^>]+>/g, "") // Strip tags - .replace(/[,:;()\-]/, " ") // Replace commans, hyphens etc (count them as spaces) - .replace(/[\.!?]/, ".") // Unify terminators - .replace(/^\s+/,"") // Strip leading whitespace - .replace(/[ ]*(\n|\r\n|\r)[ ]*/," ") // Replace new lines with spaces - .replace(/([\.])[\. ]+/,".") // Check for duplicated terminators - .replace(/[ ]*([\.])/,". ") // Pad sentence terminators - .replace(/\s+/," ") // Remove multiple spaces - .replace(/\s+$/,""); // Strip trailing whitespace + .replace(/[,:;()\/&+]|\-\-/g, " ") // Replace commas, hyphens etc (count them as spaces) + .replace(/[\.!?]/g, ".") // Unify terminators + .replace(/^\s+/, "") // Strip leading whitespace + .replace(/[\.]?(\w+)[\.]?(\w+)@(\w+)[\.](\w+)[\.]?/g, "$1$2@$3$4") // strip periods in email addresses (so they remain counted as one word) + .replace(/[ ]*(\n|\r\n|\r)[ ]*/g, ".") // Replace new lines with periods + .replace(/([\.])[\.]+/g, ".") // Check for duplicated terminators + .replace(/[ ]*([\.])/g, ". ") // Pad sentence terminators + .replace(/\s+/g, " ") // Remove multiple spaces + .replace(/\s+$/, ""); // Strip trailing whitespace - text += "."; // Add final terminator, just in case it's missing. - + if(text.slice(-1) != '.') { + text += "."; // Add final terminator, just in case it's missing. + } return text; } @@ -84,7 +86,7 @@ TextStatistics.prototype.wordCount = function(text) { text = text ? cleanText(text) : this.text; - return text.split(/[^a-z0-9]+/i).length || 1; + return text.split(/[^a-z0-9\'@\.\-]+/i).length || 1; }; TextStatistics.prototype.averageWordsPerSentence = function(text) { @@ -210,7 +212,7 @@ wordPartCount = word .split(/[^aeiouy]+/ig) .filter(function(wordPart) { - return !!wordPart.replace(/\s+/ig,"").length + return !!wordPart.replace(/\s+/ig,"").length; }) .length; diff --git a/package.json b/package.json index 9117cf5..56a6c24 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,8 @@ }, "main": "index.js", "dependencies": {}, - "devDependencies": {}, + "devDependencies": { + "mocha": "^3.0.2" + }, "optionalDependencies": {} } diff --git a/test/mocha.opts b/test/mocha.opts new file mode 100644 index 0000000..d9a95e8 --- /dev/null +++ b/test/mocha.opts @@ -0,0 +1 @@ +--reporter nyan diff --git a/test/testCleanText.js b/test/testCleanText.js new file mode 100644 index 0000000..cc18442 --- /dev/null +++ b/test/testCleanText.js @@ -0,0 +1,181 @@ +var assert = require('assert'); +var TextStatistics = require('../index.js'); + +describe('TextStatistics', function() { + // this is called when you "make" a TextStatistics + describe('#cleanText()', function() { + it('should add a final terminator if it\'s missing', function() { + var ts = TextStatistics('Hello friend'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should not add a final terminator if there is a \'.\'', function() { + var ts = TextStatistics('Hello friend.'); + assert.equal(ts.text, 'Hello friend.'); + }); + + context('trailing whitespace', function() { + it('should strip spaces', function() { + var ts = TextStatistics('Hello friend. '); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip newlines', function() { + var ts = TextStatistics('Hello friend.\n\n'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip \\r\\n thing', function() { + var ts = TextStatistics('Hello friend.\r\n'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip tabs', function() { + var ts = TextStatistics('Hello friend.\t'); + assert.equal(ts.text, 'Hello friend.'); + }); + }); + + context('leading whitespace', function() { + it('should strip spaces', function() { + var ts = TextStatistics(' Hello friend.'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip newlines', function() { + var ts = TextStatistics('\n\nHello friend.'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip \\r\\n thing', function() { + var ts = TextStatistics('\r\nHello friend.'); + assert.equal(ts.text, 'Hello friend.'); + }); + + it('should strip tabs', function() { + var ts = TextStatistics('\tHello friend.'); + assert.equal(ts.text, 'Hello friend.'); + }); + }); + + it('should remove multiple spaces between words', function() { + var ts = TextStatistics('Hello good friend.'); + assert.equal(ts.text, 'Hello good friend.'); + }); + + it('should un-duplicate terminators', function() { + var ts = TextStatistics('Hello... Friend..'); + assert.equal(ts.text, 'Hello. Friend.'); + }); + + it('should pad terminators with a space', function() { + var ts = TextStatistics('Hello.Good.Friend.'); + assert.equal(ts.text, 'Hello. Good. Friend.'); + }); + + context('unify terminators', function() { + it('should replace all !! with ..', function() { + var ts = TextStatistics('Hello! Friend!'); + assert.equal(ts.text, 'Hello. Friend.'); + }); + + it('should replace all ?? with ..', function() { + var ts = TextStatistics('Hello? Friend?'); + assert.equal(ts.text, 'Hello. Friend.'); + }); + }); + + context('replacing newlines with terminators', function() { + it('should replace \\n', function() { + var ts = TextStatistics('bulleted list here we go\nnice dog\ngood dog'); + assert.equal(ts.text, 'bulleted list here we go. nice dog. good dog.'); + }); + + it('should replace \\r\\n', function() { + var ts = TextStatistics('bulleted list here we go\r\nnice dog\r\ngood dog'); + assert.equal(ts.text, 'bulleted list here we go. nice dog. good dog.'); + }); + + it('should replace \\r', function() { + var ts = TextStatistics('bulleted list here we go\rnice dog\rgood dog'); + assert.equal(ts.text, 'bulleted list here we go. nice dog. good dog.'); + }); + }); + + context('stripping periods from email addresses', function() { + it('should replace a single period', function() { + var ts = TextStatistics('textstatistics@example.com'); + assert.equal(ts.text, 'textstatistics@examplecom.'); + }); + + it('should replace a single period in the first part', function() { + var ts = TextStatistics('text.statistics@example.com'); + assert.equal(ts.text, 'textstatistics@examplecom.'); + }); + + it('should replace two periods in the first part', function() { + var ts = TextStatistics('text.stat.istics@example.com'); + assert.equal(ts.text, 'textstatistics@examplecom.'); + }); + + it('should replace periods with a subdomain', function() { + var ts = TextStatistics('textstatistics@test.example.com'); + assert.equal(ts.text, 'textstatistics@testexamplecom.'); + }); + + it('should replace periods with a subdomain and before the @', function() { + var ts = TextStatistics('text.stat.istics@test.example.com'); + assert.equal(ts.text, 'textstatistics@testexamplecom.'); + }); + }); + + context('replacing non-terminator punctuation', function() { + it('should replace commas with spaces', function() { + var ts = TextStatistics('Hello, hi, friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace colons with spaces', function() { + var ts = TextStatistics('Hello: hi: friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace semicolons with spaces', function() { + var ts = TextStatistics('Hello; hi; friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace parentheses with spaces', function() { + var ts = TextStatistics('(Hello (hi) friend).'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace slashes with spaces', function() { + var ts = TextStatistics('Hello/hi/friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace double hyphens with spaces', function() { + var ts = TextStatistics('Hello--hi--friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should not replace a single dash with spaces', function() { + var ts = TextStatistics('Hi-di-ho friend-person!'); + assert.equal(ts.text, 'Hi-di-ho friend-person.'); + }); + + it('should replace pluses with spaces', function() { + var ts = TextStatistics('Hello + hi+friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace ampersands with spaces', function() { + var ts = TextStatistics('Hello&hi & friend.'); + assert.equal(ts.text, 'Hello hi friend.'); + }); + + it('should replace em-dash with spaces'); // can I do that? + }); + }); +}); diff --git a/test/testCountMethods.js b/test/testCountMethods.js new file mode 100644 index 0000000..0200c38 --- /dev/null +++ b/test/testCountMethods.js @@ -0,0 +1,54 @@ +var assert = require('assert'); +var TextStatistics = require('../index.js'); + +describe('TextStatistics', function() { + + describe('#sentenceCount()', function() { + it('should count a single sentence', function() { + var ts = TextStatistics('see spot run.'); + assert.equal(1, ts.sentenceCount()); + }); + + it('should count a single sentence with a comma', function() { + var ts = TextStatistics('see, spot runs.'); + assert.equal(1, ts.sentenceCount()); + }); + + it('should count a few simple sentences', function() { + var ts = TextStatistics('see spot run. good job spot. have a treat.'); + assert.equal(3, ts.sentenceCount()); + }); + }); + + describe('#wordCount()', function() { + it('a string w/o words should have word count of one, because dividing by zero', function() { + var ts = TextStatistics('.'); + assert.equal(1, ts.wordCount()); + }); + + it('should count the number of words in a text', function() { + var ts = TextStatistics('see spot run'); + assert.equal(3, ts.wordCount()); + }); + + it('should not count words with an apostrophe as two words', function() { + var ts = TextStatistics('they\'re'); + assert.equal(1, ts.wordCount()); + }); + + it('should not count the empty string after a period as a word', function() { + var ts = TextStatistics('dog.'); + assert.equal(1, ts.wordCount()); + }); + + it('should count an email address as a single word', function() { + var ts = TextStatistics('textstatistics@example.com'); + assert.equal(1, ts.wordCount()); + }); + + it('should count words with a dash as a single word', function() { + var ts = TextStatistics('long-term'); + assert.equal(1, ts.wordCount()); + }); + }); +}); \ No newline at end of file