Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,5 +848,52 @@ class MyStr(str):
self.assertIs(type(normalize(form, MyStr(input_str))), str)


class GraphemeBreakTest(unittest.TestCase):
@staticmethod
def check_version(testfile):
hdr = testfile.readline()
return unicodedata.unidata_version in hdr
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the file header look like?

With string contains tests, I worry about things like "8.0" in "18.0" matching wrongly. Could the full line be compared?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# GraphemeBreakTest-17.0.0.txt

We have the same check for normalization tests.


@requires_resource('network')
def test_grapheme_break(self):
    """Fetch GraphemeBreakTest.txt for the current UCD version and run it.

    The test is skipped, not failed, when the data file cannot be
    obtained through ``open_urlresource``.
    """
    relpath = "auxiliary/GraphemeBreakTest.txt"
    url = f"https://www.unicode.org/Public/{unicodedata.unidata_version}/ucd/{relpath}"

    # Open the resource up front so that download problems surface before
    # any of the actual checks start.
    try:
        datafile = open_urlresource(url, encoding="utf-8",
                                    check=self.check_version)
    except PermissionError:
        self.skipTest(f"Permission error when downloading {url} "
                      f"into the test data directory")
    except (OSError, HTTPException) as err:
        self.skipTest(f"Failed to download {url}: {err}")
    else:
        with datafile:
            self.run_grapheme_break_tests(datafile, unicodedata)

def run_grapheme_break_tests(self, testdata, ucd):
    """Check ``ucd.iter_graphemes()`` against every test line in *testdata*.

    *testdata* is an iterable of text lines.  Everything after ``#`` on a
    line is a comment; lines that are empty once the comment is removed are
    skipped.  The remaining fields are hexadecimal code points separated by
    ``÷`` or ``×``: each ``÷`` starts a new expected chunk, while ``×`` only
    separates code points that belong to the same chunk.

    *ucd* is the object whose ``iter_graphemes()`` is exercised, so callers
    can pass something other than the ``unicodedata`` module.
    """
    for line in testdata:
        line, _, comment = line.partition('#')
        line = line.strip()
        if not line:
            continue
        comment = comment.strip()

        chunks = []
        for field in line.replace('×', ' ').split():
            if field == '÷':
                chunks.append('')
            else:
                chunks[-1] += chr(int(field, 16))
        # Every test line ends with '÷', which leaves one trailing empty
        # chunk; drop it and verify the line was well formed.
        self.assertEqual(chunks.pop(), '', line)
        with self.subTest(line):
            result = list(ucd.iter_graphemes(''.join(chunks)))
            self.assertEqual(result, chunks, comment)


# Run the tests through unittest when this file is executed as a script.
if __name__ == "__main__":
unittest.main()
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -1662,6 +1662,7 @@ Victor Salgado
Rich Salz
Kevin Samborn
Adrian Sampson
Guillaume Sanchez
Nevada Sanchez
James Sanders
Ilya Sandler
Expand Down
99 changes: 98 additions & 1 deletion Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading