From a518d9d58a94b673a229f8b72968512c560f25a6 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Tue, 4 Jun 2024 14:27:43 -0400 Subject: [PATCH] Judge a survey comments and give them a rating (#50891) Co-authored-by: Rachael Sewell --- package-lock.json | 23 +++ package.json | 3 + src/events/analyze-comment.js | 158 +++++++++++++++++++++ src/events/middleware.js | 14 +- src/events/scripts/analyze-comments-csv.ts | 100 +++++++++++++ src/events/tests/analyze-comments.js | 157 ++++++++++++++++++++ src/events/tests/middleware.js | 9 +- 7 files changed, 448 insertions(+), 16 deletions(-) create mode 100644 src/events/analyze-comment.js create mode 100644 src/events/scripts/analyze-comments-csv.ts create mode 100644 src/events/tests/analyze-comments.js diff --git a/package-lock.json b/package-lock.json index abf218c34183..9179b52ccc0a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,6 +10,7 @@ "dependencies": { "@elastic/elasticsearch": "8.13.1", "@github/failbot": "0.8.3", + "@horizon-rs/language-guesser": "0.1.1", "@octokit/plugin-retry": "6.0.1", "@octokit/request-error": "6.1.1", "@primer/behaviors": "^1.5.1", @@ -29,6 +30,7 @@ "connect-datadog": "0.0.9", "connect-timeout": "1.9.0", "cookie-parser": "^1.4.6", + "cuss": "2.2.0", "dayjs": "^1.11.3", "dotenv": "^16.4.5", "escape-string-regexp": "5.0.0", @@ -126,6 +128,7 @@ "commander": "^12.1.0", "cross-env": "^7.0.3", "csp-parse": "0.0.2", + "csv-parse": "5.5.6", "eslint": "8.57.0", "eslint-config-prettier": "9.1.0", "eslint-config-standard": "17.1.0", @@ -1478,6 +1481,11 @@ "@hapi/hoek": "^9.0.0" } }, + "node_modules/@horizon-rs/language-guesser": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@horizon-rs/language-guesser/-/language-guesser-0.1.1.tgz", + "integrity": "sha512-tWTHkmSlP3QfFlWlsMzUDMeAqs8vXlAf91gCTDPprBwiY4ptbM0YzUUYCoRrFDgAxFVovCxBzlWoDQv7OyimKg==" + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -5204,6 +5212,21 @@ "version": "3.0.9", "license": "MIT" }, + "node_modules/csv-parse": { + "version": "5.5.6", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-5.5.6.tgz", + "integrity": "sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==", + "dev": true + }, + "node_modules/cuss": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cuss/-/cuss-2.2.0.tgz", + "integrity": "sha512-3hlHOhMiZ6YdHY5LPUhfxlx1Pj14eGttv2l9ADB1Lkv7e/us5XD798wrVLJ9DHmDO8SzCDuA+ItByFZ3M1dIYg==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/damerau-levenshtein": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/damerau-levenshtein/-/damerau-levenshtein-1.0.8.tgz", diff --git a/package.json b/package.json index 47da24c1c227..50868af80c18 100644 --- a/package.json +++ b/package.json @@ -197,6 +197,7 @@ "dependencies": { "@elastic/elasticsearch": "8.13.1", "@github/failbot": "0.8.3", + "@horizon-rs/language-guesser": "0.1.1", "@octokit/plugin-retry": "6.0.1", "@octokit/request-error": "6.1.1", "@primer/behaviors": "^1.5.1", @@ -216,6 +217,7 @@ "connect-datadog": "0.0.9", "connect-timeout": "1.9.0", "cookie-parser": "^1.4.6", + "cuss": "2.2.0", "dayjs": "^1.11.3", "dotenv": "^16.4.5", "escape-string-regexp": "5.0.0", @@ -313,6 +315,7 @@ "commander": "^12.1.0", "cross-env": "^7.0.3", "csp-parse": "0.0.2", + "csv-parse": "5.5.6", "eslint": "8.57.0", "eslint-config-prettier": "9.1.0", "eslint-config-standard": "17.1.0", diff --git a/src/events/analyze-comment.js b/src/events/analyze-comment.js new file mode 100644 index 000000000000..fd0a3faa5279 --- /dev/null +++ b/src/events/analyze-comment.js @@ -0,0 +1,158 @@ +import { cuss } from 'cuss' +import { cuss as cussPt } from 'cuss/pt' +import { cuss as cussFr } from 'cuss/fr' +import { cuss as cussEs } from 'cuss/es' +import { Language } from '@horizon-rs/language-guesser' + +const language = new Language() + +// Exported for the debugging CLI script +export const SIGNAL_RATINGS = [ + { + reduction: 1.0, + name: 'email-only', + validator: (comment) => isEmailOnly(comment), + }, + { + reduction: 0.2, + name: 'contains-email', + validator: (comment) => isContainingEmail(comment), + }, + { + reduction: 0.1, + name: 'url-only', + validator: (comment) => isURL(comment), + }, + { + reduction: 0.1, + name: 'numbers-only', + validator: (comment) => isNumbersOnly(comment), + }, + { + reduction: 0.1, + name: 'all-uppercase', + validator: (comment) => isAllUppercase(comment), + }, + { + reduction: 0.1, + name: 'too-short', + validator: (comment) => isTooShort(comment), + }, + { + reduction: 0.2, + name: 'not-language', + validator: (comment, language) => isNotLanguage(comment, language), + }, + { + reduction: 0.3, + name: 'cuss-words-likely', + validator: (comment, language) => isLikelyCussWords(comment, language), + }, + { + reduction: 0.1, + name: 'cuss-words-maybe', + validator: (comment, language) => isMaybeCussWords(comment, language), + }, +] + +export async function analyzeComment(text, language = 'en') { + const signals = [] + let rating = 1.0 + for (const { reduction, name, validator } of SIGNAL_RATINGS) { + if (validator(text, language)) { + signals.push(name) + rating -= reduction + } + if (rating <= 0) break + } + + return { signals, rating } +} + +function isEmailOnly(text) { + if (text.includes('@') && !/\s/.test(text.trim()) && !text.includes('://')) { + const atSigns = text.split('@').length + if (atSigns === 2) { + return true + } + } +} + +function isContainingEmail(text) { + if (text.includes('@') && !isEmailOnly(text)) { + // Don't use splitWords() here because `foo@example.com` will be + // split up into ['foo', 'example.com']. + return text.split(/\s+/g).some((word) => isEmailOnly(word)) + } + return false +} + +function isURL(text) { + if (!text.trim().includes(' ')) { + if (URL.canParse(text.trim())) return true + } +} + +function isNumbersOnly(text) { + return /^\d+$/.test(text.replace(/\s/g, '')) +} + +function isAllUppercase(text) { + return /[A-Z]/.test(text) && text === text.toUpperCase() +} + +function isTooShort(text) { + const split = text.trim().split(/\s+/) + if (split.length <= 1) { + // return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text) + return true + } +} + +function isNotLanguage(text, language_) { + const bestGuess = language.guessBest(text.trim()) + if (!bestGuess) return true // Can happen if the text is just whitespace + // @horizon-rs/language-guesser is based on tri-grams and can lead + // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is + // Haitian! And that 'I wanne robux 1000' is Polish! + // But that's because they are short and there's not enough clues to + // guess what language it is. You and I might know those are actually + // attempts to be English, despite the spelling. + // But are they useful comments? Given that this is just a signal, + // and not a hard blocker, it's more of a clue than a fact. + return bestGuess.alpha2 !== language_ +} + +function getCussWords(lang) { + switch (lang) { + case 'pt': + return cussPt + case 'fr': + return cussFr + case 'es': + return cussEs + default: + return cuss + } +} + +function isLikelyCussWords(text, language_, rating = 2) { + const cussWords = getCussWords(language_) + for (const word of splitWords(text, language_ || 'en')) { + if (cussWords[word] && cussWords[word] === rating) { + return true + } + } + return false +} + +function isMaybeCussWords(text, language_) { + return isLikelyCussWords(text, language_, 1) +} + +const segmenter = new Intl.Segmenter([], { granularity: 'word' }) + +function splitWords(text) { + const segmentedText = segmenter.segment(text) + return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment) +} diff --git a/src/events/middleware.js b/src/events/middleware.js index 5d31b12b0932..b69db73717a6 100644 --- a/src/events/middleware.js +++ b/src/events/middleware.js @@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js' import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js' import { formatErrors } from './lib/middleware-errors.js' import { publish as _publish } from './lib/hydro.js' +import { analyzeComment } from './analyze-comment.js' const router = express.Router() const OMIT_FIELDS = ['type'] @@ -90,18 +91,9 @@ router.post( return res.status(400).json({ message: 'Empty comment' }) } - const signals = [] - const rating = 1.0 + const { rating } = await analyzeComment(comment, locale) - // if (comment.includes('@') && !comment.includes(' ')) { - // // XXX Make it a simple email validator - // signals.push({ - // email: 'Looks like an email address', - // }) - // rating -= 0.1 - // } - - return res.json({ rating, signals }) + return res.json({ rating }) }), ) diff --git a/src/events/scripts/analyze-comments-csv.ts b/src/events/scripts/analyze-comments-csv.ts new file mode 100644 index 000000000000..893822157693 --- /dev/null +++ b/src/events/scripts/analyze-comments-csv.ts @@ -0,0 +1,100 @@ +/** + * This script is used to analyze posted survey comments in a CSV file. + * The CSV file is expected to have come from the Azure Data Explorer + * after having queries the `docs_v0_survey_event` table. + * + * + */ + +import fs from 'node:fs' +import util from 'node:util' + +import chalk from 'chalk' +import { parse } from 'csv-parse' +import { program } from 'commander' + +import { SIGNAL_RATINGS } from '../analyze-comment' + +type Options = { + outputFile: string + limit: string + random: boolean +} +program + .description('Analyze survey comments in a CSV file') + .option('-o, --output-file ', 'path to the output', 'stdout') + .option('--limit ', 'limit number of records analyzed', 'Infinity') + .option( + '--random', + 'randomize the lines analyzed (useful when limit is less than size of CSV)', + false, + ) + .argument('', 'path to the exported CSV file') + .action(main) + +program.parse(process.argv) + +async function main(csvFile: string[], options: Options) { + for (const file of csvFile) { + await analyzeFile(file, options) + } +} + +type Record = { + [key: string]: string | number +} + +async function analyzeFile(csvFile: string, options: Options) { + const parser = fs.createReadStream(csvFile).pipe( + parse({ + // Needed when parsing CSVs from the Azure Data Explorer + bom: true, + }), + ) + let headers: null | string[] = null + const records: Record[] = [] + for await (const record of parser) { + if (headers === null) { + headers = record as string[] + } else { + const obj: { + [key: string]: string + } = {} + for (let i = 0; i < headers.length; i++) { + obj[headers[i]] = record[i] + } + records.push(obj) + } + } + + const limit = parseInt(options.limit) + if (options.random) { + records.sort(() => Math.random() - 0.5) + } + for (const record of records.slice(0, limit)) { + const language = record.survey_comment_language || 'en' + let rating = 1.0 + let first = true + for (const { reduction, name, validator } of SIGNAL_RATINGS) { + const hit = validator(record.survey_comment, language) + if (hit) { + rating -= reduction + if (first) { + console.log(util.inspect(record.survey_comment)) + first = false + } + console.log(name.padEnd(10), reduction) + if (rating <= 0.0) { + break + } + } + } + if (rating !== 1.0) { + console.log(chalk.yellow(`Rating: ${rating}`)) + } else { + console.log(chalk.green('No rating reduction')) + } + + console.log('\n') + } +} diff --git a/src/events/tests/analyze-comments.js b/src/events/tests/analyze-comments.js new file mode 100644 index 000000000000..95988fb72f16 --- /dev/null +++ b/src/events/tests/analyze-comments.js @@ -0,0 +1,157 @@ +import { describe, expect, test } from 'vitest' + +import { analyzeComment } from '../analyze-comment.js' + +describe('analyzeComment', () => { + test('email only', async () => { + // Yes + { + const { signals, rating } = await analyzeComment(' foo@example.com \n') + expect(signals.includes('email-only')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('one@example.com\ntwo@example.com') + expect(signals.includes('email-only')).toBeFalsy() + } + { + const { signals } = await analyzeComment('one@two@thee') + expect(signals.includes('email-only')).toBeFalsy() + } + }) + + test('contains email', async () => { + const { signals, rating } = await analyzeComment(' foo@example.com and more words here\n') + expect(signals.includes('contains-email')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + }) + + test('url only', async () => { + // Yes + { + const { signals, rating } = await analyzeComment(' https://github.com ') + expect(signals.includes('url-only')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + { + const { signals, rating } = await analyzeComment( + ' http://user:pass@github.com/path?query=string#hash', + ) + expect(signals.includes('url-only')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('https://example.com but more text') + expect(signals.includes('url-only')).toBeFalsy() + } + { + const { signals } = await analyzeComment('foo.bar.com') + expect(signals.includes('url-only')).toBeFalsy() + } + }) + + test('numbers only', async () => { + // Yes + { + const { signals, rating } = await analyzeComment(' 1234 ') + expect(signals.includes('numbers-only')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + { + const { signals, rating } = await analyzeComment('123 456') + expect(signals.includes('numbers-only')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('123 fake street') + expect(signals.includes('numbers-only')).toBeFalsy() + } + { + const { signals } = await analyzeComment('00000-11111') + expect(signals.includes('numbers-only')).toBeFalsy() + } + }) + + test('all uppercase', async () => { + // Yes + { + const { signals, rating } = await analyzeComment(' SHOUTING ') + expect(signals.includes('all-uppercase')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + { + const { signals, rating } = await analyzeComment('ALS0 SHOUTING!') + expect(signals.includes('all-uppercase')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('Not All Uppercase') + expect(signals.includes('all-uppercase')).toBeFalsy() + } + }) + + test('too-short', async () => { + // Yes + { + const { signals, rating } = await analyzeComment('Oneword ') + expect(signals.includes('too-short')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + { + const { signals, rating } = await analyzeComment('.!?') + expect(signals.includes('too-short')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('A\nB') + expect(signals.includes('too-short')).toBeFalsy() + } + }) + + test('not-language', async () => { + // Yes + { + const { signals, rating } = await analyzeComment('Garçon') + expect(signals.includes('not-language')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + { + // example of a false positive + const { signals, rating } = await analyzeComment('english word') + expect(signals.includes('not-language')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + } + + // No + { + const { signals } = await analyzeComment('english words longer sentence this time') + expect(signals.includes('not-language')).toBeFalsy() + } + { + const { signals } = await analyzeComment('Garçon des la voituré', 'fr') + expect(signals.includes('not-language')).toBeFalsy() + } + }) + + test('cuss-words-likely', async () => { + const { signals, rating } = await analyzeComment('f*ck you'.replace('*', 'u')) + expect(signals.includes('cuss-words-likely')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + }) + + test('cuss-words-maybe', async () => { + const { signals, rating } = await analyzeComment('oh s**t'.replace('**', 'hi')) + expect(signals.includes('cuss-words-maybe')).toBeTruthy() + expect(rating).toBeLessThan(1.0) + }) +}) diff --git a/src/events/tests/middleware.js b/src/events/tests/middleware.js index 4d65542d02e0..480258f6ae69 100644 --- a/src/events/tests/middleware.js +++ b/src/events/tests/middleware.js @@ -117,7 +117,7 @@ describe('POST /events', () => { // These are mostly placeholder tests for now since most of the // implementation of this endpoint is not yet written. describe('POST /events/survey/preview/v1', () => { - test('should repond with 400 when no comment is provided', async () => { + test('should respond with 400 when no comment is provided', async () => { const body = JSON.stringify({ locale: 'en', url: '/quickstart', @@ -132,7 +132,7 @@ describe('POST /events/survey/preview/v1', () => { expect(res.statusCode).toBe(400) }) - test('should repond with 400 when comment is provided but empty', async () => { + test('should respond with 400 when comment is provided but empty', async () => { const body = JSON.stringify({ locale: 'en', url: '/quickstart', @@ -148,7 +148,7 @@ describe('POST /events/survey/preview/v1', () => { expect(res.statusCode).toBe(400) }) - test('should repond with 200 when comment is provided', async () => { + test('should respond with 200 when comment is provided', async () => { const body = JSON.stringify({ locale: 'en', url: '/quickstart', @@ -163,7 +163,6 @@ describe('POST /events/survey/preview/v1', () => { }) const respBody = JSON.parse(res.body) expect(res.statusCode).toBe(200) - expect(respBody.rating).toEqual(1.0) - expect(respBody.signals).toEqual([]) + expect(respBody.rating).toBeLessThanOrEqual(1.0) }) })