Skip to content

Commit b5714a0

Browse files
authored
Merge pull request #33337 from github/repo-sync
Repo sync
2 parents 7650023 + a518d9d commit b5714a0

File tree

7 files changed

+448
-16
lines changed

7 files changed

+448
-16
lines changed

package-lock.json

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@
197197
"dependencies": {
198198
"@elastic/elasticsearch": "8.13.1",
199199
"@github/failbot": "0.8.3",
200+
"@horizon-rs/language-guesser": "0.1.1",
200201
"@octokit/plugin-retry": "6.0.1",
201202
"@octokit/request-error": "6.1.1",
202203
"@primer/behaviors": "^1.5.1",
@@ -216,6 +217,7 @@
216217
"connect-datadog": "0.0.9",
217218
"connect-timeout": "1.9.0",
218219
"cookie-parser": "^1.4.6",
220+
"cuss": "2.2.0",
219221
"dayjs": "^1.11.3",
220222
"dotenv": "^16.4.5",
221223
"escape-string-regexp": "5.0.0",
@@ -313,6 +315,7 @@
313315
"commander": "^12.1.0",
314316
"cross-env": "^7.0.3",
315317
"csp-parse": "0.0.2",
318+
"csv-parse": "5.5.6",
316319
"eslint": "8.57.0",
317320
"eslint-config-prettier": "9.1.0",
318321
"eslint-config-standard": "17.1.0",

src/events/analyze-comment.js

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import { cuss } from 'cuss'
2+
import { cuss as cussPt } from 'cuss/pt'
3+
import { cuss as cussFr } from 'cuss/fr'
4+
import { cuss as cussEs } from 'cuss/es'
5+
import { Language } from '@horizon-rs/language-guesser'
6+
7+
// Single Language instance for this module; isNotLanguage() calls its
// guessBest() method.
const language = new Language()
8+
9+
/**
 * Ordered list of signals used to rate a survey comment.
 *
 * Every entry has:
 *  - `reduction`: how much is subtracted from the rating when the signal fires
 *  - `name`: identifier reported for the signal
 *  - `validator`: called with `(comment, language)`; a truthy result means
 *    the signal fired
 *
 * Exported so the debugging CLI script can iterate over the same list.
 */
export const SIGNAL_RATINGS = [
  // Signals that only look at the comment text.
  { reduction: 1.0, name: 'email-only', validator: (text) => isEmailOnly(text) },
  { reduction: 0.2, name: 'contains-email', validator: (text) => isContainingEmail(text) },
  { reduction: 0.1, name: 'url-only', validator: (text) => isURL(text) },
  { reduction: 0.1, name: 'numbers-only', validator: (text) => isNumbersOnly(text) },
  { reduction: 0.1, name: 'all-uppercase', validator: (text) => isAllUppercase(text) },
  { reduction: 0.1, name: 'too-short', validator: (text) => isTooShort(text) },

  // Signals that also depend on the language passed alongside the comment.
  { reduction: 0.2, name: 'not-language', validator: (text, lang) => isNotLanguage(text, lang) },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    validator: (text, lang) => isLikelyCussWords(text, lang),
  },
  {
    reduction: 0.1,
    name: 'cuss-words-maybe',
    validator: (text, lang) => isMaybeCussWords(text, lang),
  },
]
57+
58+
/**
 * Rate a comment by running it through every entry of SIGNAL_RATINGS.
 *
 * The rating starts at 1.0 and is lowered by the `reduction` of each signal
 * whose validator returns a truthy value. Evaluation stops as soon as the
 * rating reaches 0.
 *
 * @param {string} text - The comment to analyze.
 * @param {string} [language='en'] - Language code handed to the validators
 *   (compared against the guessed language and used to pick the cuss list).
 * @returns {Promise<{ signals: string[], rating: number }>} The names of the
 *   signals that fired, in order, and the resulting rating in the range 0..1.
 */
export async function analyzeComment(text, language = 'en') {
  const signals = []
  let rating = 1.0
  for (const { reduction, name, validator } of SIGNAL_RATINGS) {
    if (validator(text, language)) {
      signals.push(name)
      // Repeatedly subtracting 0.1/0.2/0.3 in floating point drifts
      // (1 - 0.1 - 0.1 - 0.1 gives 0.7000000000000001), so round to two
      // decimals, which is enough for the reductions listed in
      // SIGNAL_RATINGS. Also never report a rating below 0.
      rating = Math.max(0, Math.round((rating - reduction) * 100) / 100)
    }
    if (rating <= 0) break
  }

  return { signals, rating }
}
71+
72+
/**
 * Heuristic: is the whole text (ignoring surrounding whitespace) a single
 * email-looking token?
 *
 * This is not a real email validator. It only requires that the text has no
 * inner whitespace, does not contain '://', and has exactly one '@' with at
 * least one character on each side of it. Requiring both sides keeps a bare
 * '@' or a handle such as '@someone' from being reported as an email.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isEmailOnly(text) {
  const trimmed = text.trim()
  if (/\s/.test(trimmed) || trimmed.includes('://')) return false

  const parts = trimmed.split('@')
  // Exactly one '@' means exactly two parts.
  return parts.length === 2 && parts[0].length > 0 && parts[1].length > 0
}
80+
81+
/**
 * True when the text is not itself just an email address but at least one of
 * its whitespace-separated tokens passes isEmailOnly().
 *
 * @param {string} text
 * @returns {boolean}
 */
function isContainingEmail(text) {
  if (!text.includes('@') || isEmailOnly(text)) {
    return false
  }
  // Split on whitespace rather than with splitWords(): the word segmenter
  // would break `foo@example.com` into ['foo', 'example.com'].
  const tokens = text.split(/\s+/g)
  return tokens.some((token) => isEmailOnly(token))
}
89+
90+
/**
 * True when the whole text (ignoring surrounding whitespace) is a single
 * token that `URL.canParse()` accepts.
 *
 * Any inner whitespace disqualifies the text, not just a literal space. This
 * matches the whitespace check used by isEmailOnly() and stops text with an
 * embedded newline or tab from being handed to the URL parser as one token.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isURL(text) {
  const candidate = text.trim()
  if (/\s/.test(candidate)) return false
  return URL.canParse(candidate)
}
95+
96+
/**
 * True when, after removing every whitespace character, the text consists of
 * one or more digits and nothing else.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isNumbersOnly(text) {
  const withoutWhitespace = text.split(/\s/).join('')
  return /^\d+$/.test(withoutWhitespace)
}
99+
100+
/**
 * True when the text contains at least one A-Z letter and upper-casing it
 * would not change it.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isAllUppercase(text) {
  const hasAsciiCapital = text.search(/[A-Z]/) !== -1
  if (!hasAsciiCapital) {
    return false
  }
  return text.toUpperCase() === text
}
103+
104+
/**
 * True when the text, ignoring surrounding whitespace, is at most a single
 * whitespace-separated token (this includes empty text).
 *
 * Always returns a boolean, like the other boolean-returning validators in
 * this file.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isTooShort(text) {
  // With no whitespace left after trimming there is at most one token.
  return !/\s/.test(text.trim())
}
111+
112+
/**
 * True when the best language guess for the text differs from `language_`,
 * or when no guess could be made at all.
 *
 * @param {string} text
 * @param {string} language_ - Code compared against the guess's `alpha2`.
 * @returns {boolean}
 */
function isNotLanguage(text, language_) {
  const guess = language.guessBest(text.trim())

  // No guess at all can happen if the text is just whitespace.
  //
  // Keep in mind that @horizon-rs/language-guesser is tri-gram based and can
  // produce false positives: it thinks 'Thamk you ❤️🙏' is Haitian and that
  // 'I wanne robux 1000' is Polish. Those texts are short, so there are not
  // enough clues to go on, even though a human reader recognizes them as
  // attempts at English despite the spelling.
  // Whether such comments are useful is another matter. Since this is only
  // a signal and not a hard blocker, treat the result as a clue, not a fact.
  return !guess || guess.alpha2 !== language_
}
125+
126+
/**
 * Pick the cuss-word dictionary for a language code.
 *
 * 'pt', 'fr' and 'es' map to their dedicated imports; every other value
 * falls back to the default `cuss` import.
 *
 * @param {string} lang
 * @returns {object} The selected dictionary (looked up by word by callers).
 */
function getCussWords(lang) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}
138+
139+
/**
 * True when any word in the text is listed in the cuss dictionary for the
 * given language with exactly the requested rating.
 *
 * @param {string} text
 * @param {string} language_ - Language code used to pick the dictionary.
 * @param {number} [rating=2] - Dictionary rating a word must have to count.
 * @returns {boolean}
 */
function isLikelyCussWords(text, language_, rating = 2) {
  const cussWords = getCussWords(language_)
  for (const word of splitWords(text)) {
    // Look the word up in lowercase so that 'Word' and 'WORD' are found too.
    // NOTE(review): assumes the cuss dictionaries are keyed by lowercase
    // words; confirm against the cuss package's word lists.
    const wordRating = cussWords[word.toLowerCase()]
    if (wordRating && wordRating === rating) {
      return true
    }
  }
  return false
}
148+
149+
/**
 * Same check as isLikelyCussWords(), but matching words whose dictionary
 * rating is 1 instead of the default 2.
 *
 * @param {string} text
 * @param {string} language_
 * @returns {boolean}
 */
function isMaybeCussWords(text, language_) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}
152+
153+
// Word-granularity segmenter, created once and shared by every splitWords() call.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Split text into its word-like segments, dropping everything the segmenter
 * does not flag as word-like.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitWords(text) {
  const words = []
  for (const part of segmenter.segment(text)) {
    if (part.isWordLike) {
      words.push(part.segment)
    }
  }
  return words
}

src/events/middleware.js

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js'
88
import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js'
99
import { formatErrors } from './lib/middleware-errors.js'
1010
import { publish as _publish } from './lib/hydro.js'
11+
import { analyzeComment } from './analyze-comment.js'
1112

1213
const router = express.Router()
1314
const OMIT_FIELDS = ['type']
@@ -90,18 +91,9 @@ router.post(
9091
return res.status(400).json({ message: 'Empty comment' })
9192
}
9293

93-
const signals = []
94-
const rating = 1.0
94+
const { rating } = await analyzeComment(comment, locale)
9595

96-
// if (comment.includes('@') && !comment.includes(' ')) {
97-
// // XXX Make it a simple email validator
98-
// signals.push({
99-
// email: 'Looks like an email address',
100-
// })
101-
// rating -= 0.1
102-
// }
103-
104-
return res.json({ rating, signals })
96+
return res.json({ rating })
10597
}),
10698
)
10799

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/**
2+
* This script is used to analyze posted survey comments in a CSV file.
3+
* The CSV file is expected to have come from the Azure Data Explorer
4+
* after having queried the `docs_v0_survey_event` table.
5+
*
6+
*
7+
*/
8+
9+
import fs from 'node:fs'
10+
import util from 'node:util'
11+
12+
import chalk from 'chalk'
13+
import { parse } from 'csv-parse'
14+
import { program } from 'commander'
15+
16+
import { SIGNAL_RATINGS } from '../analyze-comment'
17+
18+
/**
 * Command-line options handed to the action handler.
 *
 * `limit` is kept as the raw string from the command line (its default is
 * the string 'Infinity') and has to be converted to a number before use.
 */
interface Options {
  outputFile: string
  limit: string
  random: boolean
}
23+
// Command-line interface: one or more CSV files plus optional flags.
program
  .description('Analyze survey comments in a CSV file')
  .option('-o, --output-file <path>', 'path to the output', 'stdout')
  .option('--limit <number>', 'limit number of records analyzed', 'Infinity')
  .option(
    '--random',
    'randomize the lines analyzed (useful when limit is less than size of CSV)',
    false,
  )
  .argument('<csv-files...>', 'path to the exported CSV file')
  .action(main)

// The action handler (`main`) is async, so use parseAsync() rather than
// parse(): that way a failure inside the handler surfaces here instead of
// becoming an unhandled promise rejection.
program.parseAsync(process.argv).catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})
36+
37+
/**
 * Action handler for the CLI: analyzes each given CSV file, one at a time
 * and in the order they were passed.
 *
 * @param csvFile - Paths of the CSV files to analyze.
 * @param options - Parsed command-line options, forwarded to analyzeFile().
 */
async function main(csvFile: string[], options: Options) {
  for (let index = 0; index < csvFile.length; index++) {
    await analyzeFile(csvFile[index], options)
  }
}
42+
43+
/**
 * One CSV row, keyed by the column names taken from the header row.
 * Not called `Record` so that it does not shadow TypeScript's built-in
 * `Record` utility type.
 */
type SurveyRecord = {
  [key: string]: string
}

/**
 * Convert the raw `--limit` value to a number.
 *
 * `Number()` is used instead of `parseInt()` because `parseInt('Infinity')`
 * is NaN, and `slice(0, NaN)` yields an empty array. With the default limit
 * that meant no record was ever analyzed.
 *
 * @throws Error when the value is not a number or is negative.
 */
function parseLimit(raw: string): number {
  const limit = Number(raw)
  if (Number.isNaN(limit) || limit < 0) {
    throw new Error(`Invalid --limit value: ${util.inspect(raw)}`)
  }
  return limit
}

/**
 * Shuffle the array in place with the Fisher-Yates algorithm.
 * (Sorting with a random comparator does not give a uniform shuffle.)
 */
function shuffleInPlace<T>(items: T[]): void {
  for (let i = items.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1))
    const swapped = items[i]
    items[i] = items[j]
    items[j] = swapped
  }
}

/**
 * Read the whole CSV file and turn every row after the first into an object
 * keyed by the names found in the first (header) row.
 */
async function readSurveyRecords(csvFile: string): Promise<SurveyRecord[]> {
  const parser = fs.createReadStream(csvFile).pipe(
    parse({
      // Needed when parsing CSVs from the Azure Data Explorer
      bom: true,
    }),
  )
  let headers: null | string[] = null
  const records: SurveyRecord[] = []
  for await (const row of parser) {
    if (headers === null) {
      // The first row carries the column names.
      headers = row as string[]
      continue
    }
    const obj: SurveyRecord = {}
    for (const [i, header] of headers.entries()) {
      obj[header] = row[i]
    }
    records.push(obj)
  }
  return records
}

/**
 * Analyze the survey comments of one CSV file and print, per comment, which
 * signals from SIGNAL_RATINGS fired and the resulting rating.
 *
 * @param csvFile - Path of the CSV file; it must have a `survey_comment`
 *   column and may have a `survey_comment_language` column.
 * @param options - `limit` caps how many records are analyzed; `random`
 *   shuffles the records before the limit is applied.
 * @throws Error when `options.limit` is invalid or the `survey_comment`
 *   column is missing.
 */
async function analyzeFile(csvFile: string, options: Options) {
  const limit = parseLimit(options.limit)
  const records = await readSurveyRecords(csvFile)

  // Fail with a clear message rather than a TypeError inside a validator.
  if (records.length > 0 && !('survey_comment' in records[0])) {
    throw new Error(`No 'survey_comment' column found in ${csvFile}`)
  }

  if (options.random) {
    shuffleInPlace(records)
  }

  for (const record of records.slice(0, limit)) {
    const language = record.survey_comment_language || 'en'
    let rating = 1.0
    let first = true
    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
      const hit = validator(record.survey_comment, language)
      if (hit) {
        rating -= reduction
        if (first) {
          // Print the comment once, just before its first signal.
          console.log(util.inspect(record.survey_comment))
          first = false
        }
        console.log(name.padEnd(10), reduction)
        if (rating <= 0.0) {
          break
        }
      }
    }
    if (rating !== 1.0) {
      console.log(chalk.yellow(`Rating: ${rating}`))
    } else {
      console.log(chalk.green('No rating reduction'))
    }

    console.log('\n')
  }
}

0 commit comments

Comments
 (0)