From 6094a70362010d818a143d7ca7ccd2976e18dd45 Mon Sep 17 00:00:00 2001 From: TomShawn <41534398+TomShawn@users.noreply.github.com> Date: Wed, 13 May 2020 16:44:12 +0800 Subject: [PATCH 1/2] reference: update documents for new collation (#2350) --- TOC.md | 2 +- .../tidb-server/configuration-file.md | 6 + reference/mysql-compatibility.md | 10 - reference/sql/character-set.md | 258 ------------ reference/sql/characterset-and-collation.md | 380 ++++++++++++++++++ reference/sql/statements/alter-database.md | 2 +- reference/sql/statements/create-database.md | 2 +- reference/sql/statements/set-names.md | 2 +- .../system-databases/information-schema.md | 2 +- reference/tools/syncer.md | 4 +- 10 files changed, 393 insertions(+), 275 deletions(-) delete mode 100644 reference/sql/character-set.md create mode 100644 reference/sql/characterset-and-collation.md diff --git a/TOC.md b/TOC.md index 50cc9370016d4..0f7a437a6e90e 100644 --- a/TOC.md +++ b/TOC.md @@ -250,7 +250,7 @@ - [Constraints](/reference/sql/constraints.md) - [Generated Columns](/reference/sql/generated-columns.md) - [Partitioning](/reference/sql/partitioning.md) - - [Character Set](/reference/sql/character-set.md) + - [Character Set and Collation](/reference/sql/characterset-and-collation.md) - [SQL Mode](/reference/sql/sql-mode.md) - [SQL Diagnosis](/reference/system-databases/sql-diagnosis.md) - [Views](/reference/sql/views.md) diff --git a/reference/configuration/tidb-server/configuration-file.md b/reference/configuration/tidb-server/configuration-file.md index e0be52cf1f195..08998ccb91709 100644 --- a/reference/configuration/tidb-server/configuration-file.md +++ b/reference/configuration/tidb-server/configuration-file.md @@ -112,6 +112,12 @@ The TiDB configuration file supports more options than command-line parameters. - Default value: [] - The list is empty by default. This means that there are no bad tables that need to be repaired. 
+### `new_collations_enabled_on_first_bootstrap` + +- Enables or disables the new collation support. +- Default value: `false` +- Note: This configuration takes effect only for the TiDB cluster that is first initialized. After the initialization, you cannot use this configuration item to enable or disable the new collation support. When a TiDB cluster is upgraded to v4.0, because the cluster has been initialized before, both `true` and `false` values of this configuration item are taken as `false`. + ### `max-server-connections` - The maximum number of concurrent client connections allowed in TiDB. It is used to control resources. diff --git a/reference/mysql-compatibility.md b/reference/mysql-compatibility.md index 9db9264c612f1..eb495869edb71 100644 --- a/reference/mysql-compatibility.md +++ b/reference/mysql-compatibility.md @@ -25,7 +25,6 @@ However, TiDB does not support some of MySQL features or behaves differently fro + `FOREIGN KEY` constraints + `FULLTEXT`/`SPATIAL` functions and indexes + Character sets other than `utf8`, `utf8mb4`, `ascii`, `latin1` and `binary` -+ Collations other than `BINARY` + Add/drop primary key + SYS schema + Optimizer trace @@ -121,7 +120,6 @@ In TiDB DDL does not block reads or writes to tables while in operation. However - Does not support lossy changes, such as from `BIGINT` to `INTEGER` or `VARCHAR(255)` to `VARCHAR(10)`. - Does not support modifying the precision of `DECIMAL` data types. - Does not support changing the `UNSIGNED` attribute. - - Only supports changing the `CHARACTER SET` attribute from `utf8` to `utf8mb4`. + `LOCK [=] {DEFAULT|NONE|SHARED|EXCLUSIVE}`: the syntax is supported, but is not applicable to TiDB. All DDL changes that are supported do not lock the table. + `ALGORITHM [=] {DEFAULT|INSTANT|INPLACE|COPY}`: the syntax for `ALGORITHM=INSTANT` and `ALGORITHM=INPLACE` is fully supported, but it works differently from MySQL because some operations that are `INPLACE` in MySQL are `INSTANT` in TiDB. 
The syntax `ALGORITHM=COPY` is not applicable to TIDB and returns a warning. + Multiple operations cannot be completed in a single `ALTER TABLE` statement. For example, it's not possible to add multiple columns or indexes in a single statement. @@ -269,14 +267,6 @@ Because they are built-in, named time zones in TiDB might behave slightly differ It is not recommended to unset the `NO_ZERO_DATE` and `NO_ZERO_IN_DATE` SQL modes, which are enabled by default in TiDB as in MySQL. While TiDB supports operating with these modes disabled, the TiKV coprocessor does not. Executing certain statements that push down date and time processing functions to TiKV might result in a statement error. -#### Handling of space at the end of string line - -Currently, when inserting data, TiDB keeps the space at the end of the line for the `VARCHAR` type, and truncate the space for the `CHAR` type. In case there is no index, TiDB behaves exactly the same as MySQL. - -If there is a `UNIQUE` index on the `VARCHAR` data, MySQL truncates the space at the end of the `VARCHAR` line before determining whether the data is duplicated, which is similar to the processing of the `CHAR` type, while TiDB keeps the space. - -When making a comparison, MySQL first truncates the constant and the space at the end of the column, while TiDB keeps them to enable exact comparison. - ### Type system differences The following column types are supported by MySQL, but not by TiDB: diff --git a/reference/sql/character-set.md b/reference/sql/character-set.md deleted file mode 100644 index dcaebba2a2599..0000000000000 --- a/reference/sql/character-set.md +++ /dev/null @@ -1,258 +0,0 @@ ---- -title: Character Set Support -summary: Learn about the supported character sets in TiDB. -category: reference - ---- - -# Character Set Support - -A character set is a set of symbols and encodings. A collation is a set of rules for comparing characters in a character set. 
- -Currently, TiDB supports the following character sets: - -```sql -mysql> SHOW CHARACTER SET; -+---------|---------------|-------------------|--------+ -| Charset | Description | Default collation | Maxlen | -+---------|---------------|-------------------|--------+ -| utf8 | UTF-8 Unicode | utf8_bin | 3 | -| utf8mb4 | UTF-8 Unicode | utf8mb4_bin | 4 | -| ascii | US ASCII | ascii_bin | 1 | -| latin1 | Latin1 | latin1_bin | 1 | -| binary | binary | binary | 1 | -+---------|---------------|-------------------|--------+ -5 rows in set (0.00 sec) -``` - -> **Note:** -> -> + In `TiDB`, `utf8` is treated as `utf8mb4`. -> + Each character set corresponds to only one default collation. - -## Collation support - -TiDB only supports binary collations. This means that unlike MySQL, in TiDB string comparisons are both case sensitive and accent sensitive: - -```sql -mysql> SELECT * FROM information_schema.collations; -+----------------+--------------------+------+------------+-------------+---------+ -| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT | IS_COMPILED | SORTLEN | -+----------------+--------------------+------+------------+-------------+---------+ -| utf8mb4_bin | utf8mb4 | 46 | Yes | Yes | 1 | -| latin1_bin | latin1 | 47 | Yes | Yes | 1 | -| binary | binary | 63 | Yes | Yes | 1 | -| ascii_bin | ascii | 65 | Yes | Yes | 1 | -| utf8_bin | utf8 | 83 | Yes | Yes | 1 | -+----------------+--------------------+------+------------+-------------+---------+ -5 rows in set (0.00 sec) - -mysql> SHOW COLLATION WHERE Charset = 'utf8mb4'; -+-------------+---------+------+---------+----------+---------+ -| Collation | Charset | Id | Default | Compiled | Sortlen | -+-------------+---------+------+---------+----------+---------+ -| utf8mb4_bin | utf8mb4 | 46 | Yes | Yes | 1 | -+-------------+---------+------+---------+----------+---------+ -1 row in set (0.00 sec) -``` - -For compatibility with MySQL, TiDB will allow other collation names to be used: - -```sql -mysql> CREATE 
TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(10)) COLLATE utf8mb4_unicode_520_ci; -Query OK, 0 rows affected (0.00 sec) - -mysql> INSERT INTO t1 VALUES (1, 'a'); -Query OK, 1 row affected (0.00 sec) - -mysql> SELECT * FROM t1 WHERE b = 'a'; -+---+------+ -| a | b | -+---+------+ -| 1 | a | -+---+------+ -1 row in set (0.00 sec) - -mysql> SELECT * FROM t1 WHERE b = 'A'; -Empty set (0.00 sec) - -mysql> SHOW CREATE TABLE t1\G -*************************** 1. row *************************** - Table: t1 -Create Table: CREATE TABLE `t1` ( - `a` int(11) NOT NULL AUTO_INCREMENT, - `b` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL, - PRIMARY KEY (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci AUTO_INCREMENT=30002 -1 row in set (0.00 sec) -``` - -## Cluster character set and collation - -Not supported yet. - -## Database character set and collation - -Each database has a character set and a collation. You can use the following statements to specify the database character set and collation: - -```sql -CREATE DATABASE db_name - [[DEFAULT] CHARACTER SET charset_name] - [[DEFAULT] COLLATE collation_name] - -ALTER DATABASE db_name - [[DEFAULT] CHARACTER SET charset_name] - [[DEFAULT] COLLATE collation_name] -``` - -`DATABASE` can be replaced with `SCHEMA` here. - -Different databases can use different character sets and collations. 
Use the `character_set_database` and `collation_database` to see the character set and collation of the current database: - -```sql -mysql> create schema test1 character set utf8 COLLATE uft8_general_ci; -Query OK, 0 rows affected (0.09 sec) - -mysql> use test1; -Database changed -mysql> SELECT @@character_set_database, @@collation_database; -+--------------------------|----------------------+ -| @@character_set_database | @@collation_database | -+--------------------------|----------------------+ -| utf8 | uft8_general_ci | -+--------------------------|----------------------+ -1 row in set (0.00 sec) - -mysql> create schema test2 character set latin1 COLLATE latin1_general_ci; -Query OK, 0 rows affected (0.09 sec) - -mysql> use test2; -Database changed -mysql> SELECT @@character_set_database, @@collation_database; -+--------------------------|----------------------+ -| @@character_set_database | @@collation_database | -+--------------------------|----------------------+ -| latin1 | latin1_general_ci | -+--------------------------|----------------------+ -1 row in set (0.00 sec) -``` - -You can also see the two values in INFORMATION_SCHEMA: - -```sql -SELECT DEFAULT_CHARACTER_SET_NAME, DEFAULT_COLLATION_NAME -FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = 'db_name'; -``` - -## Table character set and collation - -You can use the following statement to specify the character set and collation for tables: - -```sql -CREATE TABLE tbl_name (column_list) - [[DEFAULT] CHARACTER SET charset_name] - [COLLATE collation_name]] - -ALTER TABLE tbl_name - [[DEFAULT] CHARACTER SET charset_name] - [COLLATE collation_name] -``` - -For example: - -```sql -mysql> CREATE TABLE t1(a int) CHARACTER SET utf8 COLLATE utf8_general_ci; -Query OK, 0 rows affected (0.08 sec) -``` - -The database character set and collation are used as the default values for table definitions if the table character set and collation are not specified in individual column definitions. 
- -## Column character set and collation - -See the following table for the character set and collation syntax for columns: - -```sql -col_name {CHAR | VARCHAR | TEXT} (col_length) - [CHARACTER SET charset_name] - [COLLATE collation_name] - -col_name {ENUM | SET} (val_list) - [CHARACTER SET charset_name] - [COLLATE collation_name] -``` - -The table character set and collation are used as the default values for column definitions if the column character set and collation are not specified in individual column definitions. - -## String character sets and collation - -Each character literal in a string has a character set and a collation. When you use a string, this option is available: - -{{< copyable "sql" >}} - -```sql -[_charset_name]'string' [COLLATE collation_name] -``` - -Example: - -```sql -SELECT 'string'; -SELECT _latin1'string'; -SELECT _latin1'string' COLLATE latin1_danish_ci; -``` - -Rules: - -+ Rule 1: If you specify `CHARACTER SET charset_name` and `COLLATE collation_name`, then `CHARACTER SET charset_name` and `COLLATE collation_name` are used directly. -+ Rule 2: If you specify `CHARACTER SET charset_name` but do not specify `COLLATE collation_name`, `CHARACTER SET charset_name` and the default collation of `CHARACTER SET charset_name` are used. -+ Rule 3: If you specify neither `CHARACTER SET charset_name` nor `COLLATE collation_name`, the character set and collation given by the system variables `character_set_connection` and `collation_connection` are used. - -## Connection character sets and collations - -+ The server character set and collation are the values of the `character_set_server` and `collation_server` system variables. - -+ The character set and collation of the default database are the values of the `character_set_database` and `collation_database` system variables. You can use `character_set_connection` and `collation_connection` to specify the character set and collation for each connection. 
The `character_set_client` variable is to set the client character set. Before returning the result, the `character_set_results` system variable indicates the character set in which the server returns query results to the client, including the metadata of the result. - -You can use the following statement to specify a particular collation that is related to the client: - -+ `SET NAMES 'charset_name' [COLLATE 'collation_name']` - - `SET NAMES` indicates what character set the client will use to send SQL statements to the server. `SET NAMES utf8` indicates that all the requests from the client use utf8, as well as the results from the server. - - The `SET NAMES 'charset_name'` statement is equivalent to the following statement combination: - - ```sql - SET character_set_client = charset_name; - SET character_set_results = charset_name; - SET character_set_connection = charset_name; - ``` - - `COLLATE` is optional, if absent, the default collation of the `charset_name` is used. - -+ `SET CHARACTER SET 'charset_name'` - - Similar to `SET NAMES`, the `SET NAMES 'charset_name'` statement is equivalent to the following statement combination: - - ```sql - SET character_set_client = charset_name; - SET character_set_results = charset_name; - SET collation_connection = @@collation_database; - ``` - -## Optimization levels of character sets and collations - -String => Column => Table => Database => Server => Cluster - -## General rules on selecting character sets and collation - -+ Rule 1: If you specify `CHARACTER SET charset_name` and `COLLATE collation_name`, then `CHARACTER SET charset_name` and `COLLATE collation_name` are used directly. -+ Rule 2: If you specify `CHARACTER SET charset_name` and do not specify `COLLATE collation_name`, then `CHARACTER SET charset_name` and the default comparison collation of `CHARACTER SET charset_name` are used. 
-+ Rule 3: If you specify neither `CHARACTER SET charset_name` nor `COLLATE collation_name`, the character set and collation with higher optimization levels are used. - -## Validity check of characters - -For the specified `utf8` or `utf8mb4` character set, TiDB only supports the valid `utf8` character, and reports the `incorrect utf8 value` error when the character is invalid. This validity check of characters in TiDB is compatible with MySQL 8.0 but incompatible with MySQL 5.7 or earlier versions. - -To disable this error reporting, use `set @@tidb_skip_utf8_check=1;` to skip the character check. - -For more information, see [Connection Character Sets and Collations in MySQL](https://dev.mysql.com/doc/refman/5.7/en/charset-connection.html). diff --git a/reference/sql/characterset-and-collation.md b/reference/sql/characterset-and-collation.md new file mode 100644 index 0000000000000..858685e1d719b --- /dev/null +++ b/reference/sql/characterset-and-collation.md @@ -0,0 +1,380 @@ +--- +title: Character Set and Collation +summary: Learn about the supported character sets and collations in TiDB. +category: reference +aliases: ['/docs/dev/reference/sql/character-set/'] +--- + +# Character Set and Collation + +A character set is a set of symbols and encodings. A collation is a set of rules for comparing characters in a character set. 
+ +Currently, TiDB supports the following character sets: + +{{< copyable "sql" >}} + +```sql +SHOW CHARACTER SET; +``` + +```sql ++---------|---------------|-------------------|--------+ +| Charset | Description | Default collation | Maxlen | ++---------|---------------|-------------------|--------+ +| utf8 | UTF-8 Unicode | utf8_bin | 3 | +| utf8mb4 | UTF-8 Unicode | utf8mb4_bin | 4 | +| ascii | US ASCII | ascii_bin | 1 | +| latin1 | Latin1 | latin1_bin | 1 | +| binary | binary | binary | 1 | ++---------|---------------|-------------------|--------+ +5 rows in set (0.00 sec) +``` + +> **Note:** +> +> Each character set might correspond to multiple collations, but each character set has only one default collation. + +You can use the following statement to view the collations (under the [new framework for collations](#new-framework-for-collations)) that correspond to the character set. + +{{< copyable "sql" >}} + +```sql +SHOW COLLATION WHERE Charset = 'utf8mb4'; +``` + +```sql ++--------------------+---------+------+---------+----------+---------+ +| Collation | Charset | Id | Default | Compiled | Sortlen | ++--------------------+---------+------+---------+----------+---------+ +| utf8mb4_bin | utf8mb4 | 46 | Yes | Yes | 1 | +| utf8mb4_general_ci | utf8mb4 | 45 | | Yes | 1 | ++--------------------+---------+------+---------+----------+---------+ +2 rows in set (0.00 sec) +``` + +## Cluster character set and collation + +Not supported yet. + +## Database character set and collation + +Each database has a character set and a collation. You can use the following statements to specify the database character set and collation: + +```sql +CREATE DATABASE db_name + [[DEFAULT] CHARACTER SET charset_name] + [[DEFAULT] COLLATE collation_name] + +ALTER DATABASE db_name + [[DEFAULT] CHARACTER SET charset_name] + [[DEFAULT] COLLATE collation_name] +``` + +`DATABASE` can be replaced with `SCHEMA` here. 
+ +Different databases can use different character sets and collations. Use the `character_set_database` and `collation_database` system variables to see the character set and collation of the current database: + +{{< copyable "sql" >}} + +```sql +create schema test1 character set utf8mb4 COLLATE utf8mb4_general_ci; +``` + +```sql +Query OK, 0 rows affected (0.09 sec) +``` + +{{< copyable "sql" >}} + +```sql +use test1; +``` + +```sql +Database changed +``` + +{{< copyable "sql" >}} + +```sql +SELECT @@character_set_database, @@collation_database; +``` + +```sql ++--------------------------|----------------------+ +| @@character_set_database | @@collation_database | ++--------------------------|----------------------+ +| utf8mb4 | utf8mb4_general_ci | ++--------------------------|----------------------+ +1 row in set (0.00 sec) +``` + +{{< copyable "sql" >}} + +```sql +create schema test2 character set latin1 COLLATE latin1_bin; +``` + +```sql +Query OK, 0 rows affected (0.09 sec) +``` + +{{< copyable "sql" >}} + +```sql +use test2; +``` + +```sql +Database changed +``` + +{{< copyable "sql" >}} + +```sql +SELECT @@character_set_database, @@collation_database; +``` + +```sql ++--------------------------|----------------------+ +| @@character_set_database | @@collation_database | ++--------------------------|----------------------+ +| latin1 | latin1_bin | ++--------------------------|----------------------+ +1 row in set (0.00 sec) +``` + +You can also see the two values in `INFORMATION_SCHEMA`: + +{{< copyable "sql" >}} + +```sql +SELECT DEFAULT_CHARACTER_SET_NAME, DEFAULT_COLLATION_NAME +FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = 'db_name'; +``` + +## Table character set and collation + +You can use the following statement to specify the character set and collation for tables: + +```sql +CREATE TABLE tbl_name (column_list) + [[DEFAULT] CHARACTER SET charset_name] + [COLLATE collation_name] + +ALTER TABLE tbl_name + [[DEFAULT] CHARACTER SET charset_name] + [COLLATE 
collation_name] +``` + +For example: + +{{< copyable "sql" >}} + +```sql +CREATE TABLE t1(a int) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +``` + +```sql +Query OK, 0 rows affected (0.08 sec) +``` + +If the table character set and collation are not specified, the database character set and collation are used as their default values. + +## Column character set and collation + +You can use the following statement to specify the character set and collation for columns: + +```sql +col_name {CHAR | VARCHAR | TEXT} (col_length) + [CHARACTER SET charset_name] + [COLLATE collation_name] + +col_name {ENUM | SET} (val_list) + [CHARACTER SET charset_name] + [COLLATE collation_name] +``` + +If the column character set and collation are not specified, the table character set and collation are used as their default values. + +## String character sets and collation + +Each string corresponds to a character set and a collation. When you use a string, this option is available: + +{{< copyable "sql" >}} + +```sql +[_charset_name]'string' [COLLATE collation_name] +``` + +Example: + +{{< copyable "sql" >}} + +```sql +SELECT 'string'; +SELECT _utf8mb4'string'; +SELECT _utf8mb4'string' COLLATE utf8mb4_general_ci; +``` + +Rules: + ++ Rule 1: If you specify `CHARACTER SET charset_name` and `COLLATE collation_name`, then the `charset_name` character set and the `collation_name` collation are used directly. ++ Rule 2: If you specify `CHARACTER SET charset_name` but do not specify `COLLATE collation_name`, the `charset_name` character set and the default collation of `charset_name` are used. ++ Rule 3: If you specify neither `CHARACTER SET charset_name` nor `COLLATE collation_name`, the character set and collation given by the system variables `character_set_connection` and `collation_connection` are used. 
+ +## Client connection character set and collation + ++ The server character set and collation are the values of the `character_set_server` and `collation_server` system variables. + ++ The character set and collation of the default database are the values of the `character_set_database` and `collation_database` system variables. + +You can use `character_set_connection` and `collation_connection` to specify the character set and collation for each connection. The `character_set_client` variable is to set the client character set. + +Before returning the result, the `character_set_results` system variable indicates the character set in which the server returns query results to the client, including the metadata of the result. + +You can use the following statement to set the character set and collation that is related to the client: + ++ `SET NAMES 'charset_name' [COLLATE 'collation_name']` + + `SET NAMES` indicates what character set the client will use to send SQL statements to the server. `SET NAMES utf8mb4` indicates that all the requests from the client use utf8mb4, as well as the results from the server. + + The `SET NAMES 'charset_name'` statement is equivalent to the following statement combination: + + ```sql + SET character_set_client = charset_name; + SET character_set_results = charset_name; + SET character_set_connection = charset_name; + ``` + + `COLLATE` is optional, if absent, the default collation of the `charset_name` is used. 
+ ++ `SET CHARACTER SET 'charset_name'` + + Similar to `SET NAMES`, the `SET CHARACTER SET 'charset_name'` statement is equivalent to the following statement combination: + + ```sql + SET character_set_client = charset_name; + SET character_set_results = charset_name; + SET collation_connection = @@collation_database; + ``` + +## Optimization levels of character sets and collations + +String > Column > Table > Database > Server > Cluster + +## General rules on selecting character sets and collation + ++ Rule 1: If you specify `CHARACTER SET charset_name` and `COLLATE collation_name`, then the `charset_name` character set and the `collation_name` collation are used directly. ++ Rule 2: If you specify `CHARACTER SET charset_name` and do not specify `COLLATE collation_name`, then the `charset_name` character set and the default collation of `charset_name` are used. ++ Rule 3: If you specify neither `CHARACTER SET charset_name` nor `COLLATE collation_name`, the character set and collation with higher optimization levels are used. + +## Validity check of characters + +If the specified character set is `utf8` or `utf8mb4`, TiDB only supports the valid `utf8` characters. For invalid characters, TiDB reports the `incorrect utf8 value` error. This validity check of characters in TiDB is compatible with MySQL 8.0 but incompatible with MySQL 5.7 or earlier versions. + +To disable this error reporting, use `set @@tidb_skip_utf8_check=1;` to skip the character check. + +## Collation support framework + +The syntax support and semantic support for the collation are influenced by the [`new_collations_enabled_on_first_bootstrap`](/reference/configuration/tidb-server/configuration-file.md#new_collations_enabled_on_first_bootstrap) configuration item. The syntax support and semantic support are different. The former indicates that TiDB can parse and set collations. The latter indicates that TiDB can correctly use collations when comparing strings. 
+ +Before v4.0, TiDB provides only the [old framework for collations](#old-framework-for-collations). In this framework, TiDB supports syntactically parsing most of the MySQL collations but semantically takes all collations as binary collations. + +Since v4.0, TiDB supports a [new framework for collations](#new-framework-for-collations). In this framework, TiDB semantically parses different collations and strictly follows the collations when comparing strings. + +### Old framework for collations + +Before v4.0, you can specify most of the MySQL collations in TiDB, and these collations are processed according to the default collations, which means that the byte order determines the character order. Different from MySQL, TiDB does not remove the trailing spaces of a string according to the `PADDING` attribute of the collation before comparing strings, which causes the following behavior differences: + +{{< copyable "sql" >}} + +```sql +create table t(a varchar(20) charset utf8mb4 collate utf8mb4_general_ci primary key); +Query OK, 0 rows affected +insert into t values ('A'); +Query OK, 1 row affected +insert into t values ('a'); +Query OK, 1 row affected # In MySQL, because utf8mb4_general_ci is case-insensitive, the `Duplicate entry 'a'` error is reported. +insert into t values ('a '); +Query OK, 1 row affected # In MySQL, because comparison is performed after the spaces are filled in, the `Duplicate entry 'a '` error is returned. +``` + +### New framework for collations + +In TiDB 4.0, a complete framework for collations is introduced. This new framework supports semantically parsing collations and introduces the `new_collations_enabled_on_first_bootstrap` configuration item to decide whether to enable the new framework when a cluster is first initialized. 
If you initialize the cluster after the configuration item is enabled, you can check whether the new collation is enabled through the `new_collation_enabled` variable in the `mysql`.`tidb` table: + +{{< copyable "sql" >}} + +```sql +select VARIABLE_VALUE from mysql.tidb where VARIABLE_NAME='new_collation_enabled'; +``` + +```sql ++----------------+ +| VARIABLE_VALUE | ++----------------+ +| True | ++----------------+ +1 row in set (0.00 sec) +``` + +Under the new framework, TiDB supports the `utf8_general_ci` and `utf8mb4_general_ci` collations, which are compatible with MySQL. + +When `utf8_general_ci` or `utf8mb4_general_ci` is used, the string comparison is case-insensitive and accent-insensitive. At the same time, TiDB also corrects the collation's `PADDING` behavior: + +{{< copyable "sql" >}} + +```sql +create table t(a varchar(20) charset utf8mb4 collate utf8mb4_general_ci primary key); +Query OK, 0 rows affected (0.00 sec) +insert into t values ('A'); +Query OK, 1 row affected (0.00 sec) +insert into t values ('a'); +ERROR 1062 (23000): Duplicate entry 'a' for key 'PRIMARY' +insert into t values ('a '); +ERROR 1062 (23000): Duplicate entry 'a ' for key 'PRIMARY' +``` + +> **Note:** +> +> The implementation of padding in TiDB is different from that in MySQL. In MySQL, padding is implemented by filling in spaces. In TiDB, padding is implemented by cutting out the spaces at the end. The two approaches are the same in most cases. The only exception is when the end of the string contains characters that are less than spaces (0x20). For example, the result of `'a' < 'a\t'` in TiDB is `1`, but in MySQL, `'a' < 'a\t'` is equivalent to `'a ' < 'a\t'`, and the result is `0`. + +## Coercibility values of collations in expressions + +If an expression involves multiple clauses of different collations, you need to infer the collation used in the calculation. The rules are as follows: + ++ The coercibility value of the explicit `COLLATE` clause is `0`. 
++ If the collations of two strings are incompatible, the coercibility value of the concatenation of two strings with different collations is `1`. Currently, all implemented collations are compatible with each other. ++ The column's collation has a coercibility value of `2`. ++ The system constant (the string returned by `USER()` or `VERSION()`) has a coercibility value of `3`. ++ The coercibility value of constants is `4`. ++ The coercibility value of numbers or intermediate variables is `5`. ++ `NULL` or expressions derived from `NULL` have a coercibility value of `6`. + +When inferring collations, TiDB prefers using the collation of expressions with lower coercibility values (the same as MySQL). If the coercibility values of two clauses are the same, the collation is determined according to the following priority: + +binary > utf8mb4_bin > utf8mb4_general_ci > utf8_bin > utf8_general_ci > latin1_bin > ascii_bin + +If the collations of two clauses are different and the coercibility value of both clauses is `0`, TiDB cannot infer the collation and reports an error. + +## `COLLATE` clause + +TiDB supports using the `COLLATE` clause to specify the collation of an expression. The coercibility value of this expression is `0`, which has the highest priority. See the following example: + +{{< copyable "sql" >}} + +```sql +select 'a' = 'A' collate utf8mb4_general_ci; +``` + +```sql ++--------------------------------------+ +| 'a' = 'A' collate utf8mb4_general_ci | ++--------------------------------------+ +| 1 | ++--------------------------------------+ +1 row in set (0.00 sec) +``` + +For more details, see [Connection Character Sets and Collations](https://dev.mysql.com/doc/refman/5.7/en/charset-connection.html). 
diff --git a/reference/sql/statements/alter-database.md b/reference/sql/statements/alter-database.md index dd776cb84c265..4d0a47e8f4f94 100644 --- a/reference/sql/statements/alter-database.md +++ b/reference/sql/statements/alter-database.md @@ -18,7 +18,7 @@ alter_specification: | [DEFAULT] COLLATE [=] collation_name ``` -The `alter_specification` option specifies the `CHARACTER SET` and `COLLATE` of a specified database. Currently, TiDB only supports some character sets and collations. See [Character Set Support](/reference/sql/character-set.md) for details. +The `alter_specification` option specifies the `CHARACTER SET` and `COLLATE` of a specified database. Currently, TiDB only supports some character sets and collations. See [Character Set and Collation Support](/reference/sql/characterset-and-collation.md) for details. ## See also diff --git a/reference/sql/statements/create-database.md b/reference/sql/statements/create-database.md index f5eeb4404b73d..9d08da6fd0666 100644 --- a/reference/sql/statements/create-database.md +++ b/reference/sql/statements/create-database.md @@ -45,7 +45,7 @@ create_specification: If you create an existing database and does not specify `IF NOT EXISTS`, an error is displayed. -The `create_specification` option is used to specify the specific `CHARACTER SET` and `COLLATE` in the database. Currently, TiDB only supports some of the character sets and collations. For details, see [Character Set Support](/reference/sql/character-set.md). +The `create_specification` option is used to specify the specific `CHARACTER SET` and `COLLATE` in the database. Currently, TiDB only supports some of the character sets and collations. For details, see [Character Set and Collation Support](/reference/sql/characterset-and-collation.md). 
## Examples diff --git a/reference/sql/statements/set-names.md b/reference/sql/statements/set-names.md index 1b0ecea0208f7..fee1bd5e390dd 100644 --- a/reference/sql/statements/set-names.md +++ b/reference/sql/statements/set-names.md @@ -77,4 +77,4 @@ This statement is understood to be fully compatible with MySQL. Any compatibilit * [SHOW \[GLOBAL|SESSION\] VARIABLES](/reference/sql/statements/show-variables.md) * [SET ](/reference/sql/statements/set-variable.md) -* [Character Set Support](/reference/sql/character-set.md) +* [Character Set and Collation Support](/reference/sql/characterset-and-collation.md) diff --git a/reference/system-databases/information-schema.md b/reference/system-databases/information-schema.md index 9205582e17f8d..214317114083b 100644 --- a/reference/system-databases/information-schema.md +++ b/reference/system-databases/information-schema.md @@ -36,7 +36,7 @@ select * from `ANALYZE_STATUS`; ### CHARACTER_SETS table -The `CHARACTER_SETS` table provides information about [character sets](/reference/sql/character-set.md). Currently, TiDB only supports some of the character sets. +The `CHARACTER_SETS` table provides information about [character sets](/reference/sql/characterset-and-collation.md). Currently, TiDB only supports some of the character sets. {{< copyable "sql" >}} diff --git a/reference/tools/syncer.md b/reference/tools/syncer.md index 6842a6cb3763c..b065754912672 100644 --- a/reference/tools/syncer.md +++ b/reference/tools/syncer.md @@ -38,7 +38,7 @@ binlog-gtid = "2bfabd22-fff7-11e6-97f7-f02fa73bcb01:1-23,61ccbb5d-c82d-11e6-ac2e > **Note:** > > - The `syncer.meta` file only needs to be configured when it is first used. The position is automatically updated when the new subsequent binlog is replicated. -> - If you use the binlog position to replicate, you only need to configure `binlog-name` and `binlog-pos`; if you use `binlog-gtid` to replacate, you need to configure `binlog-gtid` and set `--enable-gtid` when starting Syncer. 
+> - If you use the binlog position to replicate, you only need to configure `binlog-name` and `binlog-pos`; if you use `binlog-gtid` to replicate, you need to configure `binlog-gtid` and set `--enable-gtid` when starting Syncer. ### 2. Start Syncer @@ -462,7 +462,7 @@ Before replicating data using Syncer, check the following items: 6. Check the Character Set. - TiDB differs from MySQL in [Character Set](/reference/sql/character-set.md). + TiDB differs from MySQL in [character sets](/reference/sql/characterset-and-collation.md). 7. Check whether the table to be replicated has a primary key or a unique index. From a6e8909cda4c7ddf9123dbdc4988fead81f5826b Mon Sep 17 00:00:00 2001 From: TomShawn <41534398+TomShawn@users.noreply.github.com> Date: Wed, 13 May 2020 16:50:33 +0800 Subject: [PATCH 2/2] Update reference/sql/characterset-and-collation.md --- reference/sql/characterset-and-collation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reference/sql/characterset-and-collation.md b/reference/sql/characterset-and-collation.md index 858685e1d719b..b5a8999700769 100644 --- a/reference/sql/characterset-and-collation.md +++ b/reference/sql/characterset-and-collation.md @@ -2,7 +2,7 @@ title: Character Set and Collation summary: Learn about the supported character sets and collations in TiDB. category: reference -aliases: ['/docs/dev/reference/sql/character-set/'] +aliases: ['/docs/stable/reference/sql/character-set/'] --- # Character Set and Collation