From a518d9d58a94b673a229f8b72968512c560f25a6 Mon Sep 17 00:00:00 2001
From: Peter Bengtsson <peterbe@github.com>
Date: Tue, 4 Jun 2024 14:27:43 -0400
Subject: [PATCH] Judge a survey comments and give them a rating (#50891)

Co-authored-by: Rachael Sewell <rachmari@github.com>
---
 package-lock.json                          |  23 +++
 package.json                               |   3 +
 src/events/analyze-comment.js              | 158 +++++++++++++++++++++
 src/events/middleware.js                   |  14 +-
 src/events/scripts/analyze-comments-csv.ts | 100 +++++++++++++
 src/events/tests/analyze-comments.js       | 157 ++++++++++++++++++++
 src/events/tests/middleware.js             |   9 +-
 7 files changed, 448 insertions(+), 16 deletions(-)
 create mode 100644 src/events/analyze-comment.js
 create mode 100644 src/events/scripts/analyze-comments-csv.ts
 create mode 100644 src/events/tests/analyze-comments.js

diff --git a/package-lock.json b/package-lock.json
index abf218c34183..9179b52ccc0a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,6 +10,7 @@
       "dependencies": {
         "@elastic/elasticsearch": "8.13.1",
         "@github/failbot": "0.8.3",
+        "@horizon-rs/language-guesser": "0.1.1",
         "@octokit/plugin-retry": "6.0.1",
         "@octokit/request-error": "6.1.1",
         "@primer/behaviors": "^1.5.1",
@@ -29,6 +30,7 @@
         "connect-datadog": "0.0.9",
         "connect-timeout": "1.9.0",
         "cookie-parser": "^1.4.6",
+        "cuss": "2.2.0",
         "dayjs": "^1.11.3",
         "dotenv": "^16.4.5",
         "escape-string-regexp": "5.0.0",
@@ -126,6 +128,7 @@
         "commander": "^12.1.0",
         "cross-env": "^7.0.3",
         "csp-parse": "0.0.2",
+        "csv-parse": "5.5.6",
         "eslint": "8.57.0",
         "eslint-config-prettier": "9.1.0",
         "eslint-config-standard": "17.1.0",
@@ -1478,6 +1481,11 @@
         "@hapi/hoek": "^9.0.0"
       }
     },
+    "node_modules/@horizon-rs/language-guesser": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/@horizon-rs/language-guesser/-/language-guesser-0.1.1.tgz",
+      "integrity": "sha512-tWTHkmSlP3QfFlWlsMzUDMeAqs8vXlAf91gCTDPprBwiY4ptbM0YzUUYCoRrFDgAxFVovCxBzlWoDQv7OyimKg=="
+    },
     "node_modules/@humanwhocodes/config-array": {
       "version": "0.11.14",
       "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz",
@@ -5204,6 +5212,21 @@
       "version": "3.0.9",
       "license": "MIT"
     },
+    "node_modules/csv-parse": {
+      "version": "5.5.6",
+      "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-5.5.6.tgz",
+      "integrity": "sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==",
+      "dev": true
+    },
+    "node_modules/cuss": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/cuss/-/cuss-2.2.0.tgz",
+      "integrity": "sha512-3hlHOhMiZ6YdHY5LPUhfxlx1Pj14eGttv2l9ADB1Lkv7e/us5XD798wrVLJ9DHmDO8SzCDuA+ItByFZ3M1dIYg==",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
     "node_modules/damerau-levenshtein": {
       "version": "1.0.8",
       "resolved": "https://registry.npmjs.org/damerau-levenshtein/-/damerau-levenshtein-1.0.8.tgz",
diff --git a/package.json b/package.json
index 47da24c1c227..50868af80c18 100644
--- a/package.json
+++ b/package.json
@@ -197,6 +197,7 @@
   "dependencies": {
     "@elastic/elasticsearch": "8.13.1",
     "@github/failbot": "0.8.3",
+    "@horizon-rs/language-guesser": "0.1.1",
     "@octokit/plugin-retry": "6.0.1",
     "@octokit/request-error": "6.1.1",
     "@primer/behaviors": "^1.5.1",
@@ -216,6 +217,7 @@
     "connect-datadog": "0.0.9",
     "connect-timeout": "1.9.0",
     "cookie-parser": "^1.4.6",
+    "cuss": "2.2.0",
     "dayjs": "^1.11.3",
     "dotenv": "^16.4.5",
     "escape-string-regexp": "5.0.0",
@@ -313,6 +315,7 @@
     "commander": "^12.1.0",
     "cross-env": "^7.0.3",
     "csp-parse": "0.0.2",
+    "csv-parse": "5.5.6",
     "eslint": "8.57.0",
     "eslint-config-prettier": "9.1.0",
     "eslint-config-standard": "17.1.0",
diff --git a/src/events/analyze-comment.js b/src/events/analyze-comment.js
new file mode 100644
index 000000000000..fd0a3faa5279
--- /dev/null
+++ b/src/events/analyze-comment.js
@@ -0,0 +1,158 @@
+import { cuss } from 'cuss'
+import { cuss as cussPt } from 'cuss/pt'
+import { cuss as cussFr } from 'cuss/fr'
+import { cuss as cussEs } from 'cuss/es'
+import { Language } from '@horizon-rs/language-guesser'
+
+const language = new Language()
+
+// Exported for the debugging CLI script
+export const SIGNAL_RATINGS = [
+  {
+    reduction: 1.0,
+    name: 'email-only',
+    validator: (comment) => isEmailOnly(comment),
+  },
+  {
+    reduction: 0.2,
+    name: 'contains-email',
+    validator: (comment) => isContainingEmail(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'url-only',
+    validator: (comment) => isURL(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'numbers-only',
+    validator: (comment) => isNumbersOnly(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'all-uppercase',
+    validator: (comment) => isAllUppercase(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'too-short',
+    validator: (comment) => isTooShort(comment),
+  },
+  {
+    reduction: 0.2,
+    name: 'not-language',
+    validator: (comment, language) => isNotLanguage(comment, language),
+  },
+  {
+    reduction: 0.3,
+    name: 'cuss-words-likely',
+    validator: (comment, language) => isLikelyCussWords(comment, language),
+  },
+  {
+    reduction: 0.1,
+    name: 'cuss-words-maybe',
+    validator: (comment, language) => isMaybeCussWords(comment, language),
+  },
+]
+
+export async function analyzeComment(text, language = 'en') {
+  const signals = []
+  let rating = 1.0
+  for (const { reduction, name, validator } of SIGNAL_RATINGS) {
+    if (validator(text, language)) {
+      signals.push(name)
+      rating -= reduction
+    }
+    if (rating <= 0) break
+  }
+
+  return { signals, rating }
+}
+
+function isEmailOnly(text) {
+  if (text.includes('@') && !/\s/.test(text.trim()) && !text.includes('://')) {
+    const atSigns = text.split('@').length
+    if (atSigns === 2) {
+      return true
+    }
+  }
+}
+
+function isContainingEmail(text) {
+  if (text.includes('@') && !isEmailOnly(text)) {
+    // Don't use splitWords() here because `foo@example.com` will be
+    // split up into ['foo', 'example.com'].
+    return text.split(/\s+/g).some((word) => isEmailOnly(word))
+  }
+  return false
+}
+
+function isURL(text) {
+  if (!text.trim().includes(' ')) {
+    if (URL.canParse(text.trim())) return true
+  }
+}
+
+function isNumbersOnly(text) {
+  return /^\d+$/.test(text.replace(/\s/g, ''))
+}
+
+function isAllUppercase(text) {
+  return /[A-Z]/.test(text) && text === text.toUpperCase()
+}
+
+function isTooShort(text) {
+  const split = text.trim().split(/\s+/)
+  if (split.length <= 1) {
+    // return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
+    return true
+  }
+}
+
+function isNotLanguage(text, language_) {
+  const bestGuess = language.guessBest(text.trim())
+  if (!bestGuess) return true // Can happen if the text is just whitespace
+  // @horizon-rs/language-guesser is based on tri-grams and can lead
+  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
+  // Haitian! And that 'I wanne robux 1000' is Polish!
+  // But that's because they are short and there's not enough clues to
+  // guess what language it is. You and I might know those are actually
+  // attempts to be English, despite the spelling.
+  // But are they useful comments? Given that this is just a signal,
+  // and not a hard blocker, it's more of a clue than a fact.
+  return bestGuess.alpha2 !== language_
+}
+
+function getCussWords(lang) {
+  switch (lang) {
+    case 'pt':
+      return cussPt
+    case 'fr':
+      return cussFr
+    case 'es':
+      return cussEs
+    default:
+      return cuss
+  }
+}
+
+function isLikelyCussWords(text, language_, rating = 2) {
+  const cussWords = getCussWords(language_)
+  for (const word of splitWords(text, language_ || 'en')) {
+    if (cussWords[word] && cussWords[word] === rating) {
+      return true
+    }
+  }
+  return false
+}
+
+function isMaybeCussWords(text, language_) {
+  return isLikelyCussWords(text, language_, 1)
+}
+
+const segmenter = new Intl.Segmenter([], { granularity: 'word' })
+
+function splitWords(text) {
+  const segmentedText = segmenter.segment(text)
+  return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
+}
diff --git a/src/events/middleware.js b/src/events/middleware.js
index 5d31b12b0932..b69db73717a6 100644
--- a/src/events/middleware.js
+++ b/src/events/middleware.js
@@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js'
 import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js'
 import { formatErrors } from './lib/middleware-errors.js'
 import { publish as _publish } from './lib/hydro.js'
+import { analyzeComment } from './analyze-comment.js'
 
 const router = express.Router()
 const OMIT_FIELDS = ['type']
@@ -90,18 +91,9 @@ router.post(
       return res.status(400).json({ message: 'Empty comment' })
     }
 
-    const signals = []
-    const rating = 1.0
+    const { rating } = await analyzeComment(comment, locale)
 
-    // if (comment.includes('@') && !comment.includes(' ')) {
-    //   // XXX Make it a simple email validator
-    //   signals.push({
-    //     email: 'Looks like an email address',
-    //   })
-    //   rating -= 0.1
-    // }
-
-    return res.json({ rating, signals })
+    return res.json({ rating })
   }),
 )
 
diff --git a/src/events/scripts/analyze-comments-csv.ts b/src/events/scripts/analyze-comments-csv.ts
new file mode 100644
index 000000000000..893822157693
--- /dev/null
+++ b/src/events/scripts/analyze-comments-csv.ts
@@ -0,0 +1,100 @@
+/**
+ * This script is used to analyze posted survey comments in a CSV file.
+ * The CSV file is expected to have come from the Azure Data Explorer
+ * after having queries the `docs_v0_survey_event` table.
+ *
+ *
+ */
+
+import fs from 'node:fs'
+import util from 'node:util'
+
+import chalk from 'chalk'
+import { parse } from 'csv-parse'
+import { program } from 'commander'
+
+import { SIGNAL_RATINGS } from '../analyze-comment'
+
+type Options = {
+  outputFile: string
+  limit: string
+  random: boolean
+}
+program
+  .description('Analyze survey comments in a CSV file')
+  .option('-o, --output-file <path>', 'path to the output', 'stdout')
+  .option('--limit <number>', 'limit number of records analyzed', 'Infinity')
+  .option(
+    '--random',
+    'randomize the lines analyzed (useful when limit is less than size of CSV)',
+    false,
+  )
+  .argument('<csv-files...>', 'path to the exported CSV file')
+  .action(main)
+
+program.parse(process.argv)
+
+async function main(csvFile: string[], options: Options) {
+  for (const file of csvFile) {
+    await analyzeFile(file, options)
+  }
+}
+
+type Record = {
+  [key: string]: string | number
+}
+
+async function analyzeFile(csvFile: string, options: Options) {
+  const parser = fs.createReadStream(csvFile).pipe(
+    parse({
+      // Needed when parsing CSVs from the Azure Data Explorer
+      bom: true,
+    }),
+  )
+  let headers: null | string[] = null
+  const records: Record[] = []
+  for await (const record of parser) {
+    if (headers === null) {
+      headers = record as string[]
+    } else {
+      const obj: {
+        [key: string]: string
+      } = {}
+      for (let i = 0; i < headers.length; i++) {
+        obj[headers[i]] = record[i]
+      }
+      records.push(obj)
+    }
+  }
+
+  const limit = parseInt(options.limit)
+  if (options.random) {
+    records.sort(() => Math.random() - 0.5)
+  }
+  for (const record of records.slice(0, limit)) {
+    const language = record.survey_comment_language || 'en'
+    let rating = 1.0
+    let first = true
+    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
+      const hit = validator(record.survey_comment, language)
+      if (hit) {
+        rating -= reduction
+        if (first) {
+          console.log(util.inspect(record.survey_comment))
+          first = false
+        }
+        console.log(name.padEnd(10), reduction)
+        if (rating <= 0.0) {
+          break
+        }
+      }
+    }
+    if (rating !== 1.0) {
+      console.log(chalk.yellow(`Rating: ${rating}`))
+    } else {
+      console.log(chalk.green('No rating reduction'))
+    }
+
+    console.log('\n')
+  }
+}
diff --git a/src/events/tests/analyze-comments.js b/src/events/tests/analyze-comments.js
new file mode 100644
index 000000000000..95988fb72f16
--- /dev/null
+++ b/src/events/tests/analyze-comments.js
@@ -0,0 +1,157 @@
+import { describe, expect, test } from 'vitest'
+
+import { analyzeComment } from '../analyze-comment.js'
+
+describe('analyzeComment', () => {
+  test('email only', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment(' foo@example.com \n')
+      expect(signals.includes('email-only')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('one@example.com\ntwo@example.com')
+      expect(signals.includes('email-only')).toBeFalsy()
+    }
+    {
+      const { signals } = await analyzeComment('one@two@thee')
+      expect(signals.includes('email-only')).toBeFalsy()
+    }
+  })
+
+  test('contains email', async () => {
+    const { signals, rating } = await analyzeComment(' foo@example.com and more words here\n')
+    expect(signals.includes('contains-email')).toBeTruthy()
+    expect(rating).toBeLessThan(1.0)
+  })
+
+  test('url only', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment(' https://github.com ')
+      expect(signals.includes('url-only')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment(
+        ' http://user:pass@github.com/path?query=string#hash',
+      )
+      expect(signals.includes('url-only')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('https://example.com but more text')
+      expect(signals.includes('url-only')).toBeFalsy()
+    }
+    {
+      const { signals } = await analyzeComment('foo.bar.com')
+      expect(signals.includes('url-only')).toBeFalsy()
+    }
+  })
+
+  test('numbers only', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment(' 1234 ')
+      expect(signals.includes('numbers-only')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment('123 456')
+      expect(signals.includes('numbers-only')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('123 fake street')
+      expect(signals.includes('numbers-only')).toBeFalsy()
+    }
+    {
+      const { signals } = await analyzeComment('00000-11111')
+      expect(signals.includes('numbers-only')).toBeFalsy()
+    }
+  })
+
+  test('all uppercase', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment(' SHOUTING ')
+      expect(signals.includes('all-uppercase')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment('ALS0 SHOUTING!')
+      expect(signals.includes('all-uppercase')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('Not All Uppercase')
+      expect(signals.includes('all-uppercase')).toBeFalsy()
+    }
+  })
+
+  test('too-short', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment('Oneword ')
+      expect(signals.includes('too-short')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment('.!?')
+      expect(signals.includes('too-short')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('A\nB')
+      expect(signals.includes('too-short')).toBeFalsy()
+    }
+  })
+
+  test('not-language', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment('Garçon')
+      expect(signals.includes('not-language')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      // example of a false positive
+      const { signals, rating } = await analyzeComment('english word')
+      expect(signals.includes('not-language')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('english words longer sentence this time')
+      expect(signals.includes('not-language')).toBeFalsy()
+    }
+    {
+      const { signals } = await analyzeComment('Garçon des la voituré', 'fr')
+      expect(signals.includes('not-language')).toBeFalsy()
+    }
+  })
+
+  test('cuss-words-likely', async () => {
+    const { signals, rating } = await analyzeComment('f*ck you'.replace('*', 'u'))
+    expect(signals.includes('cuss-words-likely')).toBeTruthy()
+    expect(rating).toBeLessThan(1.0)
+  })
+
+  test('cuss-words-maybe', async () => {
+    const { signals, rating } = await analyzeComment('oh s**t'.replace('**', 'hi'))
+    expect(signals.includes('cuss-words-maybe')).toBeTruthy()
+    expect(rating).toBeLessThan(1.0)
+  })
+})
diff --git a/src/events/tests/middleware.js b/src/events/tests/middleware.js
index 4d65542d02e0..480258f6ae69 100644
--- a/src/events/tests/middleware.js
+++ b/src/events/tests/middleware.js
@@ -117,7 +117,7 @@ describe('POST /events', () => {
 // These are mostly placeholder tests for now since most of the
 // implementation of this endpoint is not yet written.
 describe('POST /events/survey/preview/v1', () => {
-  test('should repond with 400 when no comment is provided', async () => {
+  test('should respond with 400 when no comment is provided', async () => {
     const body = JSON.stringify({
       locale: 'en',
       url: '/quickstart',
@@ -132,7 +132,7 @@ describe('POST /events/survey/preview/v1', () => {
     expect(res.statusCode).toBe(400)
   })
 
-  test('should repond with 400 when comment is provided but empty', async () => {
+  test('should respond with 400 when comment is provided but empty', async () => {
     const body = JSON.stringify({
       locale: 'en',
       url: '/quickstart',
@@ -148,7 +148,7 @@ describe('POST /events/survey/preview/v1', () => {
     expect(res.statusCode).toBe(400)
   })
 
-  test('should repond with 200 when comment is provided', async () => {
+  test('should respond with 200 when comment is provided', async () => {
     const body = JSON.stringify({
       locale: 'en',
       url: '/quickstart',
@@ -163,7 +163,6 @@ describe('POST /events/survey/preview/v1', () => {
     })
     const respBody = JSON.parse(res.body)
     expect(res.statusCode).toBe(200)
-    expect(respBody.rating).toEqual(1.0)
-    expect(respBody.signals).toEqual([])
+    expect(respBody.rating).toBeLessThanOrEqual(1.0)
   })
 })