Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ third_party/vulkan-loader
third_party/vulkan-validationlayers/
.vs

*.pyc

# Vim swap files
[._]*.s[a-w][a-z]

Expand Down
179 changes: 179 additions & 0 deletions tools/check_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/env python

# Copyright 2020 The Amber Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do Google practices allow using SPDX IDs instead of longer license statements here? Considering incorporating this in the Vulkan repositories per suggestion from @zhangyiwei and that would be a bit nicer for us.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My reading is we need to include this full block (https://opensource.google/docs/releasing/preparing/#license-headers)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that. Any idea who I'd contact to ask about that policy - Chris DiBona? I've had more than enough interactions with IP lawyers not to expect a change or a quick answer, but still worth posing the ask.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess Chris would be the right person?

# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script to check files for non-inclusive language. The script scans the files
under the current directory and reports the non-inclusive terminology it
finds.

Usage: run the script from the directory you want checked; it scans
recursively through everything beneath that directory.
"""

import fnmatch
import os
import re
import sys

# Case-insensitive patterns that identify non-inclusive terminology.
#
# The "he" family is spelled out with explicit whitespace / string-boundary
# anchors instead of \b so that it only fires on a free-standing word. The
# apostrophe class accepts both the ASCII apostrophe and the typographic
# right single quote (U+2019).
REGEXES = [
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    r"(?i)master",
    r"(?i)slave",
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['\u2019]d\s",
    r"(?i)\she['\u2019]d$",
    r"(?i)^he['\u2019]d\s",
    r"(?i)^he['\u2019]d$",
    r"(?i)\she['\u2019]s\s",
    r"(?i)\she['\u2019]s$",
    r"(?i)^he['\u2019]s\s",
    r"(?i)^he['\u2019]s$",
    r"(?i)\she['\u2019]ll\s",
    r"(?i)\she['\u2019]ll$",
    r"(?i)^he['\u2019]ll\s",
    r"(?i)^he['\u2019]ll$",
    r"(?i)grandfather",
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]

# Case-insensitive patterns for text that contains a flagged term but is
# acceptable (for example an identifier such as MS_SLAVE, or "man page").
SUPPRESSIONS = [
    r"(?i)MS_SLAVE",
    # The hyphen is placed last in the class so it is a literal '-' and not a
    # range operator; the separator may be a space, an underscore or a hyphen.
    r"(?i)man[ _-]?page",
]


# Compiled once at import time so each scanned file reuses the same objects.
REGEX_LIST = [re.compile(reg) for reg in REGEXES]

SUPPRESSION_LIST = [re.compile(supp) for supp in SUPPRESSIONS]

def find(top, filename_glob, skip_glob_list):
    """Collects the files under top whose names match filename_glob.

    Directories whose names match any glob in skip_glob_list are pruned from
    the walk, and any file that has the same base name as this script is left
    out of the result.
    """
    own_name = os.path.basename(__file__)
    matches = []
    for root, subdirs, names in os.walk(top):
        # Prune in place: os.walk only descends into what is left in subdirs.
        subdirs[:] = [
            subdir for subdir in subdirs
            if not any(fnmatch.fnmatch(subdir, skip)
                       for skip in skip_glob_list)
        ]
        matches.extend(
            os.path.join(root, name)
            for name in fnmatch.filter(names, filename_glob)
            if name != own_name
        )
    return matches


def filtered_descendants(glob):
    """Lists the files below the current directory whose names match glob.

    Dependency, build-output and version-control directories are not
    descended into.
    """
    skipped_dirs = ['third_party', 'external', 'build*', 'out*',
                    'CompilerIdCXX', '.git']
    return find('.', glob, skipped_dirs)

def check_match(filename, contents):
    """Checks contents for non-inclusive language and prints what it finds.

    Every pattern in REGEX_LIST is tried against contents. An occurrence is
    ignored when a pattern from SUPPRESSION_LIST matches at the same position.
    For each pattern, the first occurrence that is not suppressed is printed
    together with filename.

    Args:
      filename: name used as the prefix of each printed message.
      contents: the text to scan.

    Returns:
      True if at least one unsuppressed occurrence was printed, else False.
    """
    ret = False
    for reg in REGEX_LIST:
        # Walk every occurrence, not just the first: an early suppressed hit
        # (e.g. "man-page") must not hide a later genuine one (e.g. "man").
        for match in reg.finditer(contents):
            idx = match.start()
            suppressed = False
            for supp in SUPPRESSION_LIST:
                # Passing the position avoids copying the tail of contents.
                if supp.match(contents, idx):
                    suppressed = True
                    break

                # This is a hack to handle the 3-character "MS_" prefix of
                # MS_SLAVE, which starts before the flagged word. Find a
                # better way if more suppressions need to look behind the
                # start of the match.
                if idx >= 3 and supp.match(contents, idx - 3):
                    suppressed = True
                    break

            if suppressed:
                continue

            # No matching suppression: report once per pattern.
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            ret = True
            break

    return ret


def alert_if_lang_matches(glob, verbose=False):
    """Prints the non-inclusive language found in files matching glob.

    Finds all glob-matching files under the current directory and checks each
    one for non-inclusive language. Files that cannot be read, or that are not
    valid UTF-8 text (binary files, for instance), are skipped.

    Args:
      glob: file name pattern selecting the files to check.
      verbose: when true, prints the name of every file that is skipped.

    Returns:
      The number of files in which non-inclusive language was found.
    """
    printed_count = 0
    for path in filtered_descendants(glob):
        try:
            with open(path, 'r', encoding='utf8') as handle:
                contents = handle.read()
        except (OSError, UnicodeError):
            # Skipping is deliberate best-effort, but only for read and
            # decode failures; anything else (Ctrl-C, a bug in the checker)
            # must not be silently swallowed.
            if verbose:
                print("skipping {}".format(path))
            continue

        if check_match(path, contents):
            printed_count += 1

    return printed_count


def main():
    """Checks every file below the current directory, then exits.

    The exit status is non-zero when any file was flagged.
    """
    globs = ['*']
    flagged = sum(alert_if_lang_matches(pattern) for pattern in globs)
    sys.exit(flagged > 0)


if __name__ == '__main__':
    main()
61 changes: 61 additions & 0 deletions tools/check_language_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python

# Copyright 2020 The Amber Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for check_language.py."""

import os
import sys
import unittest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import check_language

class TestCheckLanguage(unittest.TestCase):
    """Exercises check_language.check_match on flagged and suppressed text."""

    def testMatches(self):
        # Every term below must be flagged when embedded in a sentence.
        flagged_terms = (
            "blacklist", "black-list", "black_list",
            "whitelist", "white-list", "white_list",
            "greylist", "grey-list", "grey_list",
            "graylist", "gray-list", "gray_list",
            "first class citizen",
            "blackhat", "black-hat", "black_hat",
            "whitehat", "white-hat", "white_hat",
            "greyhat", "grey-hat", "grey_hat",
            "grayhat", "gray-hat", "gray_hat",
            "master", "slave",
            "him", "his", "she", "her", "hers", "man", "woman",
            "he", "he'd", "he's", "he'll",
            "he\u2019d", "he\u2019s", "he\u2019ll",
            "grandfather", "mitm", "crazy", "insane",
            "blind to", "flying blind", "blind eye",
            "cripple", "crippled", "dumb", "dummy",
            "paranoid", "sane", "sanity",
            "redline", "red-line", "red_line",
        )

        for term in flagged_terms:
            sentence = "this is a " + term + " attempt"
            flagged = check_language.check_match("", sentence)
            self.assertTrue(flagged, term)

    def testSuppression(self):
        # Suppressed phrases must not be reported.
        man_pages = check_language.check_match("", "in the man-pages")
        self.assertFalse(man_pages)
        ms_slave = check_language.check_match("", "the MS_SLAVE test")
        self.assertFalse(ms_slave)

    def testMatchStartofFileWhenRequireSpace(self):
        # "he" at the very start of the text has no preceding whitespace.
        flagged = check_language.check_match("", "he said")
        self.assertTrue(flagged)

    def testMatchOverNewline(self):
        # A newline between the two words still counts as whitespace.
        flagged = check_language.check_match("", "flying\nblind")
        self.assertTrue(flagged)


if __name__ == '__main__':
    unittest.main()