From 7afeb8b8078018a167e7bce6421024b95517b234 Mon Sep 17 00:00:00 2001
From: Asura7969 <1402357969@qq.com>
Date: Wed, 8 Nov 2023 17:04:53 +0800
Subject: [PATCH 1/2] Minor: Improve the document format of JoinHashMap

---
 .../src/joins/hash_join_utils.rs              | 115 ++++++++++--------
 1 file changed, 62 insertions(+), 53 deletions(-)
diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
index 3a2a85c727226..3ea0331ab4fe8 100644
--- a/datafusion/physical-plan/src/joins/hash_join_utils.rs
+++ b/datafusion/physical-plan/src/joins/hash_join_utils.rs
@@ -40,59 +40,68 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
 use hashbrown::raw::RawTable;
 use hashbrown::HashSet;
 
-// Maps a `u64` hash value based on the build side ["on" values] to a list of indices with this key's value.
-// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the build side,
-// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value.
-// E.g. 1 -> [3, 6, 8] indicates that the column values map to rows 3, 6 and 8 for hash value 1
-// As the key is a hash value, we need to check possible hash collisions in the probe stage
-// During this stage it might be the case that a row is contained the same hashmap value,
-// but the values don't match. Those are checked in the [equal_rows] macro
-// The indices (values) are stored in a separate chained list stored in the `Vec<u64>`.
-// The first value (+1) is stored in the hashmap, whereas the next value is stored in array at the position value.
-// The chain can be followed until the value "0" has been reached, meaning the end of the list.
-// Also see chapter 5.3 of [Balancing vectorized query execution with bandwidth-optimized storage](https://dare.uva.nl/search?identifier=5ccbb60a-38b8-4eeb-858a-e7735dd37487)
-// See the example below:
-// Insert (1,1)
-// map:
-// ---------
-// | 1 | 2 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 0 | 0 |
-// ---------------------
-// Insert (2,2)
-// map:
-// ---------
-// | 1 | 2 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 0 | 0 |
-// ---------------------
-// Insert (1,3)
-// map:
-// ---------
-// | 1 | 4 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 2 | 0 |  <--- hash value 1 maps to 4,2 (which means indices values 3,1)
-// ---------------------
-// Insert (1,4)
-// map:
-// ---------
-// | 1 | 5 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
-// ---------------------
-// TODO: speed up collision checks
-// https://github.com/apache/arrow-datafusion/issues/50
+/// Maps a `u64` hash value based on the build side ["on" values] to a list of indices with this key's value.
+///
+/// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the build side,
+/// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value.
+///
+/// E.g. 1 -> [3, 6, 8] indicates that the column values map to rows 3, 6 and 8 for hash value 1.
+/// As the key is a hash value, we need to check for possible hash collisions in the probe stage.
+/// During this stage it might be the case that a row is contained in the same hashmap value,
+/// but the values don't match. Those are checked in the [equal_rows] macro.
+/// The indices (values) are stored in a separate chained list stored in the `Vec<u64>`.
+///
+/// The first value (+1) is stored in the hashmap, whereas the next value is stored in the array at the position value.
+///
+/// The chain can be followed until the value "0" has been reached, meaning the end of the list.
+/// Also see chapter 5.3 of [Balancing vectorized query execution with bandwidth-optimized storage](https://dare.uva.nl/search?identifier=5ccbb60a-38b8-4eeb-858a-e7735dd37487)
+///
+/// # Example
+///
+/// ``` text
+/// See the example below:
+/// Insert (1,1)
+/// map:
+/// ---------
+/// | 1 | 2 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 0 | 0 |
+/// ---------------------
+/// Insert (2,2)
+/// map:
+/// ---------
+/// | 1 | 2 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 0 | 0 |
+/// ---------------------
+/// Insert (1,3)
+/// map:
+/// ---------
+/// | 1 | 4 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 2 | 0 | <--- hash value 1 maps to 4,2 (which means index values 3,1)
+/// ---------------------
+/// Insert (1,4)
+/// map:
+/// ---------
+/// | 1 | 5 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means index values 4,3,1)
+/// ---------------------
+/// ```
+///
+/// TODO: [speed up collision checks](https://github.com/apache/arrow-datafusion/issues/50)
 pub struct JoinHashMap {
     // Stores hash value to last row index
     pub map: RawTable<(u64, u64)>,

From 733d00016bc9f7fcae03365d2f227db7ef0cfdec Mon Sep 17 00:00:00 2001
From: Asura7969 <1402357969@qq.com>
Date: Fri, 17 Nov 2023 10:39:10 +0800
Subject: [PATCH 2/2] Port tests in describe.rs to sqllogictest

---
 datafusion/core/tests/sql/describe.rs         | 72 -------------------
 datafusion/core/tests/sql/mod.rs              |  1 -
 .../sqllogictest/test_files/describe.slt      | 24 +++++++
 3 files changed, 24 insertions(+), 73 deletions(-)
 delete mode 100644 datafusion/core/tests/sql/describe.rs

diff --git a/datafusion/core/tests/sql/describe.rs b/datafusion/core/tests/sql/describe.rs
deleted file mode 100644
index cd8e79b2c93b1..0000000000000
--- a/datafusion/core/tests/sql/describe.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::assert_batches_eq;
-use datafusion::prelude::*;
-use datafusion_common::test_util::parquet_test_data;
-
-#[tokio::test]
-async fn describe_plan() {
-    let ctx = parquet_context().await;
-
-    let query = "describe alltypes_tiny_pages";
-    let results = ctx.sql(query).await.unwrap().collect().await.unwrap();
-
-    let expected = vec![
-        "+-----------------+-----------------------------+-------------+",
-        "| column_name     | data_type                   | is_nullable |",
-        "+-----------------+-----------------------------+-------------+",
-        "| id              | Int32                       | YES         |",
-        "| bool_col        | Boolean                     | YES         |",
-        "| tinyint_col     | Int8                        | YES         |",
-        "| smallint_col    | Int16                       | YES         |",
-        "| int_col         | Int32                       | YES         |",
-        "| bigint_col      | Int64                       | YES         |",
-        "| float_col       | Float32                     | YES         |",
-        "| double_col      | Float64                     | YES         |",
-        "| date_string_col | Utf8                        | YES         |",
-        "| string_col      | Utf8                        | YES         |",
-        "| timestamp_col   | Timestamp(Nanosecond, None) | YES         |",
-        "| year            | Int32                       | YES         |",
-        "| month           | Int32                       | YES         |",
-        "+-----------------+-----------------------------+-------------+",
-    ];
-
-    assert_batches_eq!(expected, &results);
-
-    // also ensure we plan Describe via SessionState
-    let state = ctx.state();
-    let plan = state.create_logical_plan(query).await.unwrap();
-    let df = DataFrame::new(state, plan);
-    let results = df.collect().await.unwrap();
-
-    assert_batches_eq!(expected, &results);
-}
-
-/// Return a SessionContext with parquet file registered
-async fn parquet_context() -> SessionContext {
-    let ctx = SessionContext::new();
-    let testdata = parquet_test_data();
-    ctx.register_parquet(
-        "alltypes_tiny_pages",
-        &format!("{testdata}/alltypes_tiny_pages.parquet"),
-        ParquetReadOptions::default(),
-    )
-    .await
-    .unwrap();
-    ctx
-}
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index 40a9e627a72ab..d550656533010 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -75,7 +75,6 @@ macro_rules! test_expression {
 pub mod aggregates;
 pub mod create_drop;
 pub mod csv_files;
-pub mod describe;
 pub mod displayable;
 pub mod explain_analyze;
 pub mod expr;
diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt
index 007aec443cbc9..f94a2e453884f 100644
--- a/datafusion/sqllogictest/test_files/describe.slt
+++ b/datafusion/sqllogictest/test_files/describe.slt
@@ -62,3 +62,27 @@ DROP TABLE aggregate_simple;
 
 statement error Error during planning: table 'datafusion.public.../core/tests/data/aggregate_simple.csv' not found
 DESCRIBE '../core/tests/data/aggregate_simple.csv';
+
+##########
+# Describe command
+##########
+
+statement ok
+CREATE EXTERNAL TABLE alltypes_tiny_pages STORED AS PARQUET LOCATION '../../parquet-testing/data/alltypes_tiny_pages.parquet';
+
+query TTT
+describe alltypes_tiny_pages;
+----
+id Int32 YES
+bool_col Boolean YES
+tinyint_col Int8 YES
+smallint_col Int16 YES
+int_col Int32 YES
+bigint_col Int64 YES
+float_col Float32 YES
+double_col Float64 YES
+date_string_col Utf8 YES
+string_col Utf8 YES
+timestamp_col Timestamp(Nanosecond, None) YES
+year Int32 YES
+month Int32 YES