From 97300216410daf52b08a6a0b9270dd5b742f8c98 Mon Sep 17 00:00:00 2001 From: Marcono1234 Date: Fri, 29 Oct 2021 14:23:07 +0200 Subject: [PATCH 1/4] Java: Add `CharacterLiteral.getCodePointValue()` --- java/ql/lib/semmle/code/java/Expr.qll | 20 ++++++- java/ql/lib/semmle/code/java/Type.qll | 5 +- .../constants/constants/Values.java | 4 +- .../constants/getIntValue.expected | 2 + .../literals/charLiterals/CharLiterals.java | 1 + .../charLiterals/charLiterals.expected | 40 ++++++------- .../literals/charLiterals/charLiterals.ql | 2 +- .../stringLiterals/StringLiterals.java | 1 + .../stringLiterals/stringLiterals.expected | 56 +++++++++---------- 9 files changed, 76 insertions(+), 55 deletions(-) diff --git a/java/ql/lib/semmle/code/java/Expr.qll b/java/ql/lib/semmle/code/java/Expr.qll index f2a309904ad1..19f41772ff68 100755 --- a/java/ql/lib/semmle/code/java/Expr.qll +++ b/java/ql/lib/semmle/code/java/Expr.qll @@ -298,19 +298,20 @@ class CompileTimeConstantExpr extends Expr { * * Note that this does not handle the following cases: * - * - values of type `long`, - * - `char` literals. + * - values of type `long`. */ cached int getIntValue() { exists(IntegralType t | this.getType() = t | t.getName().toLowerCase() != "long") and ( exists(string lit | lit = this.(Literal).getValue() | - // `char` literals may get parsed incorrectly, so disallow. + // Don't parse `char` literal as int, instead get its code point value (see below) not this instanceof CharacterLiteral and result = lit.toInt() ) or + result = this.(CharacterLiteral).getCodePointValue() + or exists(CastExpr cast, int val | cast = this and val = cast.getExpr().(CompileTimeConstantExpr).getIntValue() | @@ -719,6 +720,19 @@ class DoubleLiteral extends Literal, @doubleliteral { /** A character literal. For example, `'\n'`. */ class CharacterLiteral extends Literal, @characterliteral { override string getAPrimaryQlClass() { result = "CharacterLiteral" } + + /** + * Gets a string which consists of the single character represented by + * this literal. + */ + override string getValue() { result = super.getValue() } + + /** + * Gets the Unicode code point value of the character represented by + * this literal. The result is the same as if the Java code had cast + * the character to an `int`. + */ + int getCodePointValue() { result = any(int i | i.toUnicode() = getValue()) } } /** diff --git a/java/ql/lib/semmle/code/java/Type.qll b/java/ql/lib/semmle/code/java/Type.qll index 1d0658595b25..7ceec7483d12 100755 --- a/java/ql/lib/semmle/code/java/Type.qll +++ b/java/ql/lib/semmle/code/java/Type.qll @@ -1123,7 +1123,10 @@ predicate erasedHaveIntersection(RefType t1, RefType t2) { t2 = erase(_) } -/** An integral type, which may be either a primitive or a boxed type. */ +/** + * An integral type, which may be either a primitive or a boxed type. + * This includes the types `char` and `Character`. + */ class IntegralType extends Type { IntegralType() { exists(string name | diff --git a/java/ql/test/library-tests/constants/constants/Values.java b/java/ql/test/library-tests/constants/constants/Values.java index ad061e96c777..7cf88fb9ad82 100644 --- a/java/ql/test/library-tests/constants/constants/Values.java +++ b/java/ql/test/library-tests/constants/constants/Values.java @@ -16,7 +16,7 @@ void values(final int notConstant) { int binary_literal = 0b101010; //42 int negative_binary_literal = -0b101010; //-42 int binary_literal_underscores = 0b1_0101_0; //42 - char char_literal = '*'; //Not handled + char char_literal = '*'; //42 long long_literal = 42L; //Not handled boolean boolean_literal = true; //true Integer boxed_int = new Integer(42); //Not handled @@ -30,7 +30,7 @@ void values(final int notConstant) { byte downcast_byte_4 = (byte) 214; // -42 byte downcast_byte_5 = (byte) (-214); // 42 short downcast_short = (short) 32768; // -32768 - int cast_of_non_constant = (int) '*'; //Not handled + int cast_of_non_constant = (int) '*'; //42 long cast_to_long = (long) 42; //Not handled int unary_plus = +42; //42 diff --git a/java/ql/test/library-tests/constants/getIntValue.expected b/java/ql/test/library-tests/constants/getIntValue.expected index bee36f5122ba..8dfd0fc7841a 100644 --- a/java/ql/test/library-tests/constants/getIntValue.expected +++ b/java/ql/test/library-tests/constants/getIntValue.expected @@ -9,6 +9,7 @@ | constants/Values.java:16:30:16:37 | 0b101010 | 42 | | constants/Values.java:17:39:17:47 | -... | -42 | | constants/Values.java:18:42:18:51 | 0b1_0101_0 | 42 | +| constants/Values.java:19:29:19:31 | '*' | 42 | | constants/Values.java:25:20:25:27 | (...)... | 42 | | constants/Values.java:26:25:26:33 | (...)... | 42 | | constants/Values.java:27:32:27:43 | (...)... | -42 | @@ -17,6 +18,7 @@ | constants/Values.java:30:32:30:41 | (...)... | -42 | | constants/Values.java:31:32:31:44 | (...)... | 42 | | constants/Values.java:32:32:32:44 | (...)... | -32768 | +| constants/Values.java:33:36:33:44 | (...)... | 42 | | constants/Values.java:36:26:36:28 | +... | 42 | | constants/Values.java:39:27:39:29 | -... | -42 | | constants/Values.java:43:27:43:28 | ~... | -1 | diff --git a/java/ql/test/library-tests/literals/charLiterals/CharLiterals.java b/java/ql/test/library-tests/literals/charLiterals/CharLiterals.java index 51274f9899e2..8640a50b06c4 100644 --- a/java/ql/test/library-tests/literals/charLiterals/CharLiterals.java +++ b/java/ql/test/library-tests/literals/charLiterals/CharLiterals.java @@ -13,6 +13,7 @@ public class CharLiterals { '\\', '\'', '\123', // octal escape sequence for 'S' + // CodeQL uses U+FFFD for unpaired surrogates, see https://github.com/github/codeql/issues/6611 '\uD800', // high surrogate '\uDC00', // low surrogate // Using Unicode escapes (which are handled during pre-processing) diff --git a/java/ql/test/library-tests/literals/charLiterals/charLiterals.expected b/java/ql/test/library-tests/literals/charLiterals/charLiterals.expected index 987f7c17fe4d..28fae87a6365 100644 --- a/java/ql/test/library-tests/literals/charLiterals/charLiterals.expected +++ b/java/ql/test/library-tests/literals/charLiterals/charLiterals.expected @@ -1,20 +1,20 @@ -| CharLiterals.java:5:3:5:5 | 'a' | a | -| CharLiterals.java:6:3:6:10 | '\\u0061' | a | -| CharLiterals.java:7:3:7:10 | '\\u0000' | \u0000 | -| CharLiterals.java:8:3:8:10 | '\\uFFFF' | \uffff | -| CharLiterals.java:9:3:9:10 | '\\ufFfF' | \uffff | -| CharLiterals.java:10:3:10:6 | '\\0' | \u0000 | -| CharLiterals.java:11:3:11:6 | '\\n' | \n | -| CharLiterals.java:12:3:12:5 | '"' | " | -| CharLiterals.java:13:3:13:6 | '\\\\' | \\ | -| CharLiterals.java:14:3:14:6 | '\\'' | ' | -| CharLiterals.java:15:3:15:8 | '\\123' | S | -| CharLiterals.java:16:3:16:10 | '\\uD800' | \ufffd | -| CharLiterals.java:17:3:17:10 | '\\uDC00' | \ufffd | -| CharLiterals.java:19:3:19:16 | '\\u005C\\u005C' | \\ | -| CharLiterals.java:20:3:20:16 | '\\u005C\\u0027' | ' | -| CharLiterals.java:21:8:21:15 | 7a\\u0027 | a | -| CharLiterals.java:26:4:26:6 | 'a' | a | -| CharLiterals.java:27:4:27:6 | 'a' | a | -| CharLiterals.java:32:3:32:5 | 'a' | a | -| CharLiterals.java:32:9:32:11 | 'b' | b | +| CharLiterals.java:5:3:5:5 | 'a' | a | 97 | +| CharLiterals.java:6:3:6:10 | '\\u0061' | a | 97 | +| CharLiterals.java:7:3:7:10 | '\\u0000' | \u0000 | 0 | +| CharLiterals.java:8:3:8:10 | '\\uFFFF' | \uffff | 65535 | +| CharLiterals.java:9:3:9:10 | '\\ufFfF' | \uffff | 65535 | +| CharLiterals.java:10:3:10:6 | '\\0' | \u0000 | 0 | +| CharLiterals.java:11:3:11:6 | '\\n' | \n | 10 | +| CharLiterals.java:12:3:12:5 | '"' | " | 34 | +| CharLiterals.java:13:3:13:6 | '\\\\' | \\ | 92 | +| CharLiterals.java:14:3:14:6 | '\\'' | ' | 39 | +| CharLiterals.java:15:3:15:8 | '\\123' | S | 83 | +| CharLiterals.java:17:3:17:10 | '\\uD800' | \ufffd | 65533 | +| CharLiterals.java:18:3:18:10 | '\\uDC00' | \ufffd | 65533 | +| CharLiterals.java:20:3:20:16 | '\\u005C\\u005C' | \\ | 92 | +| CharLiterals.java:21:3:21:16 | '\\u005C\\u0027' | ' | 39 | +| CharLiterals.java:22:8:22:15 | 7a\\u0027 | a | 97 | +| CharLiterals.java:27:4:27:6 | 'a' | a | 97 | +| CharLiterals.java:28:4:28:6 | 'a' | a | 97 | +| CharLiterals.java:33:3:33:5 | 'a' | a | 97 | +| CharLiterals.java:33:9:33:11 | 'b' | b | 98 | diff --git a/java/ql/test/library-tests/literals/charLiterals/charLiterals.ql b/java/ql/test/library-tests/literals/charLiterals/charLiterals.ql index 104e2c03dc4e..9a374e644573 100644 --- a/java/ql/test/library-tests/literals/charLiterals/charLiterals.ql +++ b/java/ql/test/library-tests/literals/charLiterals/charLiterals.ql @@ -1,4 +1,4 @@ import semmle.code.java.Expr from CharacterLiteral lit -select lit, lit.getValue() +select lit, lit.getValue(), lit.getCodePointValue() diff --git a/java/ql/test/library-tests/literals/stringLiterals/StringLiterals.java b/java/ql/test/library-tests/literals/stringLiterals/StringLiterals.java index 9c0c55b12e5f..f40f57361f35 100644 --- a/java/ql/test/library-tests/literals/stringLiterals/StringLiterals.java +++ b/java/ql/test/library-tests/literals/stringLiterals/StringLiterals.java @@ -24,6 +24,7 @@ public class StringLiterals { "\uD800\uDC00", // surrogate pair "\uDBFF\uDFFF", // U+10FFFF // Unpaired surrogates + // CodeQL uses U+FFFD for them, see https://github.com/github/codeql/issues/6611 "\uD800", "\uDC00", "hello\uD800hello\uDC00world", // malformed surrogates diff --git a/java/ql/test/library-tests/literals/stringLiterals/stringLiterals.expected b/java/ql/test/library-tests/literals/stringLiterals/stringLiterals.expected index bf91a56e7233..e08d7778e69b 100644 --- a/java/ql/test/library-tests/literals/stringLiterals/stringLiterals.expected +++ b/java/ql/test/library-tests/literals/stringLiterals/stringLiterals.expected @@ -17,32 +17,32 @@ | StringLiterals.java:23:3:23:18 | "\\uaBcDeF\\u0aB1" | \uabcdeF\u0ab1 | \uabcdeF\u0ab1 | | | StringLiterals.java:24:3:24:16 | "\\uD800\\uDC00" | \ud800\udc00 | \ud800\udc00 | | | StringLiterals.java:25:3:25:16 | "\\uDBFF\\uDFFF" | \udbff\udfff | \udbff\udfff | | -| StringLiterals.java:27:3:27:10 | "\\uD800" | \ufffd | \ufffd | | -| StringLiterals.java:28:3:28:10 | "\\uDC00" | \ufffd | \ufffd | | -| StringLiterals.java:29:3:29:31 | "hello\\uD800hello\\uDC00world" | hello\ufffdhello\ufffdworld | hello\ufffdhello\ufffdworld | | -| StringLiterals.java:31:3:31:16 | "\\u005C\\u0022" | " | " | | -| StringLiterals.java:32:8:32:20 | 2\\u0061\\u0022 | a | a | | -| StringLiterals.java:37:3:39:5 | """ \t \n\t\ttest "text" and escaped \\u0022\n\t\t""" | test "text" and escaped "\n | test "text" and escaped "\n | text-block | -| StringLiterals.java:41:3:43:5 | """\n\t\t\tindented\n\t\t""" | \tindented\n | \tindented\n | text-block | -| StringLiterals.java:44:3:46:5 | """\n\tno indentation last line\n\t\t""" | no indentation last line\n | no indentation last line\n | text-block | -| StringLiterals.java:47:3:49:7 | """\n\tindentation last line\n\t\t\\s""" | indentation last line\n\t | indentation last line\n\t | text-block | -| StringLiterals.java:50:3:52:6 | """\n\t\t\tnot-indented\n\t\t\t""" | not-indented\n | not-indented\n | text-block | -| StringLiterals.java:53:3:55:4 | """\n\t\tindented\n\t""" | \tindented\n | \tindented\n | text-block | -| StringLiterals.java:56:4:58:5 | """\n\t\tnot-indented\n\t\t""" | not-indented\n | not-indented\n | text-block | -| StringLiterals.java:59:3:62:6 | """\n\t\t spaces (only single space is trimmed)\n\t\t\ttab\n\t\t\t""" | spaces (only single space is trimmed)\ntab\n | spaces (only single space is trimmed)\ntab\n | text-block | -| StringLiterals.java:63:3:64:22 | """\n\t\t\tend on same line""" | end on same line | end on same line | text-block | -| StringLiterals.java:65:3:68:5 | """\n\t\ttrailing spaces ignored: \t \n\t\tnot ignored: \t \\s\n\t\t""" | trailing spaces ignored:\nnot ignored: \t \n | trailing spaces ignored:\nnot ignored: \t \n | text-block | -| StringLiterals.java:69:3:70:18 | """\n\t\t3 quotes:""\\"""" | 3 quotes:""" | 3 quotes:""" | text-block | -| StringLiterals.java:71:3:74:5 | """\n\t\tline \\\n\t\tcontinuation \\\n\t\t""" | line continuation | line continuation | text-block | -| StringLiterals.java:75:3:79:5 | """\n\t\tExplicit line breaks:\\n\n\t\t\\r\\n\n\t\t\\r\n\t\t""" | Explicit line breaks:\n\n\r\n\n\r\n | Explicit line breaks:\n\n\r\n\n\r\n | text-block | -| StringLiterals.java:82:10:84:16 | 2"\\u0022\n\t\ttest\n\t\t\\u0022\\uu0022" | test\n | test\n | | -| StringLiterals.java:90:3:90:19 | "hello" + "world" | helloworld | helloworld | | -| StringLiterals.java:91:3:92:20 | """\n\t\thello""" + "world" | helloworld | helloworld | text-block | -| StringLiterals.java:93:10:93:12 | "a" | a | a | | -| StringLiterals.java:94:3:94:5 | "a" | a | a | | +| StringLiterals.java:28:3:28:10 | "\\uD800" | \ufffd | \ufffd | | +| StringLiterals.java:29:3:29:10 | "\\uDC00" | \ufffd | \ufffd | | +| StringLiterals.java:30:3:30:31 | "hello\\uD800hello\\uDC00world" | hello\ufffdhello\ufffdworld | hello\ufffdhello\ufffdworld | | +| StringLiterals.java:32:3:32:16 | "\\u005C\\u0022" | " | " | | +| StringLiterals.java:33:8:33:20 | 2\\u0061\\u0022 | a | a | | +| StringLiterals.java:38:3:40:5 | """ \t \n\t\ttest "text" and escaped \\u0022\n\t\t""" | test "text" and escaped "\n | test "text" and escaped "\n | text-block | +| StringLiterals.java:42:3:44:5 | """\n\t\t\tindented\n\t\t""" | \tindented\n | \tindented\n | text-block | +| StringLiterals.java:45:3:47:5 | """\n\tno indentation last line\n\t\t""" | no indentation last line\n | no indentation last line\n | text-block | +| StringLiterals.java:48:3:50:7 | """\n\tindentation last line\n\t\t\\s""" | indentation last line\n\t | indentation last line\n\t | text-block | +| StringLiterals.java:51:3:53:6 | """\n\t\t\tnot-indented\n\t\t\t""" | not-indented\n | not-indented\n | text-block | +| StringLiterals.java:54:3:56:4 | """\n\t\tindented\n\t""" | \tindented\n | \tindented\n | text-block | +| StringLiterals.java:57:4:59:5 | """\n\t\tnot-indented\n\t\t""" | not-indented\n | not-indented\n | text-block | +| StringLiterals.java:60:3:63:6 | """\n\t\t spaces (only single space is trimmed)\n\t\t\ttab\n\t\t\t""" | spaces (only single space is trimmed)\ntab\n | spaces (only single space is trimmed)\ntab\n | text-block | +| StringLiterals.java:64:3:65:22 | """\n\t\t\tend on same line""" | end on same line | end on same line | text-block | +| StringLiterals.java:66:3:69:5 | """\n\t\ttrailing spaces ignored: \t \n\t\tnot ignored: \t \\s\n\t\t""" | trailing spaces ignored:\nnot ignored: \t \n | trailing spaces ignored:\nnot ignored: \t \n | text-block | +| StringLiterals.java:70:3:71:18 | """\n\t\t3 quotes:""\\"""" | 3 quotes:""" | 3 quotes:""" | text-block | +| StringLiterals.java:72:3:75:5 | """\n\t\tline \\\n\t\tcontinuation \\\n\t\t""" | line continuation | line continuation | text-block | +| StringLiterals.java:76:3:80:5 | """\n\t\tExplicit line breaks:\\n\n\t\t\\r\\n\n\t\t\\r\n\t\t""" | Explicit line breaks:\n\n\r\n\n\r\n | Explicit line breaks:\n\n\r\n\n\r\n | text-block | +| StringLiterals.java:83:10:85:16 | 2"\\u0022\n\t\ttest\n\t\t\\u0022\\uu0022" | test\n | test\n | | +| StringLiterals.java:91:3:91:19 | "hello" + "world" | helloworld | helloworld | | +| StringLiterals.java:92:3:93:20 | """\n\t\thello""" + "world" | helloworld | helloworld | text-block | +| StringLiterals.java:94:10:94:12 | "a" | a | a | | | StringLiterals.java:95:3:95:5 | "a" | a | a | | -| StringLiterals.java:96:7:96:9 | "a" | a | a | | -| StringLiterals.java:97:3:97:5 | "a" | a | a | | -| StringLiterals.java:98:10:98:12 | "a" | a | a | | -| StringLiterals.java:99:3:99:5 | "a" | a | a | | -| StringLiterals.java:100:9:100:11 | "a" | a | a | | +| StringLiterals.java:96:3:96:5 | "a" | a | a | | +| StringLiterals.java:97:7:97:9 | "a" | a | a | | +| StringLiterals.java:98:3:98:5 | "a" | a | a | | +| StringLiterals.java:99:10:99:12 | "a" | a | a | | +| StringLiterals.java:100:3:100:5 | "a" | a | a | | +| StringLiterals.java:101:9:101:11 | "a" | a | a | | From 4f59886a658b6dc80142285c90e0c98cf404ba79 Mon Sep 17 00:00:00 2001 From: Marcono1234 Date: Fri, 29 Oct 2021 14:24:30 +0200 Subject: [PATCH 2/4] Java: Simplify CompileTimeConstantExpr.getIntValue() The changed code previously also only covered IntegerLiteral: - Restricted to Literal - Integral type - != "long" - != "char" So the only class left which matches all of these is IntegerLiteral. --- java/ql/lib/semmle/code/java/Expr.qll | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/java/ql/lib/semmle/code/java/Expr.qll b/java/ql/lib/semmle/code/java/Expr.qll index 19f41772ff68..719ad8da4ab2 100755 --- a/java/ql/lib/semmle/code/java/Expr.qll +++ b/java/ql/lib/semmle/code/java/Expr.qll @@ -304,11 +304,7 @@ class CompileTimeConstantExpr extends Expr { int getIntValue() { exists(IntegralType t | this.getType() = t | t.getName().toLowerCase() != "long") and ( - exists(string lit | lit = this.(Literal).getValue() | - // Don't parse `char` literal as int, instead get its code point value (see below) - not this instanceof CharacterLiteral and - result = lit.toInt() - ) + result = this.(IntegerLiteral).getIntValue() or result = this.(CharacterLiteral).getCodePointValue() or From fe5115169fcc1ca99965d8a622b1507ddc9c2569 Mon Sep 17 00:00:00 2001 From: Marcono1234 Date: Fri, 29 Oct 2021 14:56:07 +0200 Subject: [PATCH 3/4] Java: Describe `CharacterLiteral.getValue()` behavior for surrogates --- java/ql/lib/semmle/code/java/Expr.qll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/java/ql/lib/semmle/code/java/Expr.qll b/java/ql/lib/semmle/code/java/Expr.qll index 719ad8da4ab2..a716a1d12849 100755 --- a/java/ql/lib/semmle/code/java/Expr.qll +++ b/java/ql/lib/semmle/code/java/Expr.qll @@ -720,6 +720,9 @@ class CharacterLiteral extends Literal, @characterliteral { /** * Gets a string which consists of the single character represented by * this literal. + * + * Unicode surrogate characters (U+D800 to U+DFFF) have the replacement character + * U+FFFD as result instead. */ override string getValue() { result = super.getValue() } From 301a90759670b784d8d6d587198e8ce67db41236 Mon Sep 17 00:00:00 2001 From: Anders Schack-Mulligen Date: Mon, 1 Nov 2021 09:36:09 +0100 Subject: [PATCH 4/4] Update java/ql/lib/semmle/code/java/Expr.qll --- java/ql/lib/semmle/code/java/Expr.qll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/ql/lib/semmle/code/java/Expr.qll b/java/ql/lib/semmle/code/java/Expr.qll index a716a1d12849..2d6fc9c16712 100755 --- a/java/ql/lib/semmle/code/java/Expr.qll +++ b/java/ql/lib/semmle/code/java/Expr.qll @@ -731,7 +731,7 @@ class CharacterLiteral extends Literal, @characterliteral { * this literal. The result is the same as if the Java code had cast * the character to an `int`. */ - int getCodePointValue() { result = any(int i | i.toUnicode() = getValue()) } + int getCodePointValue() { result.toUnicode() = this.getValue() } } /**