Skip to content

Commit ca562ab

Browse files
NiallEgan authored and susodapop committed
Use Arrow schema if available
This PR changes the Python client to use the Arrow schema if it has been sent by the server, instead of re-constructing an approximation from the Hive schema. The primary difference is in the timezone information for timestamps.

* Added new unit tests to check that the correct field is used
* Adapted integration tests to add timezones as appropriate
1 parent 963d5b0 commit ca562ab

File tree

6 files changed

+90
-48
lines changed

6 files changed

+90
-48
lines changed

cmdexec/clients/python/dev_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ thrift==0.13.0
66
pandas==1.3.4
77
future==0.18.2
88
packaging==21.3
9+
pytz==2021.3

cmdexec/clients/python/src/databricks/sql/client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ def __init__(self,
480480
self.arraysize = arraysize
481481
self.thrift_backend = thrift_backend
482482
self.description = execute_response.description
483-
self._arrow_schema = execute_response.arrow_schema
483+
self._arrow_schema_bytes = execute_response.arrow_schema_bytes
484484
self._next_row_index = 0
485485

486486
if execute_response.arrow_queue:
@@ -505,7 +505,7 @@ def _fill_results_buffer(self):
505505
max_rows=self.arraysize,
506506
max_bytes=self.buffer_size_bytes,
507507
expected_row_start_offset=self._next_row_index,
508-
arrow_schema=self._arrow_schema,
508+
arrow_schema_bytes=self._arrow_schema_bytes,
509509
description=self.description)
510510
self.results = results
511511
self.has_more_rows = has_more_rows

cmdexec/clients/python/src/databricks/sql/thrift_backend.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ def open_session(self, session_configuration, catalog, schema):
330330
initial_namespace = None
331331

332332
open_session_req = ttypes.TOpenSessionReq(
333-
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V4,
333+
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V5,
334334
client_protocol=None,
335335
initialNamespace=initial_namespace,
336336
canUseMultipleCatalogs=True,
@@ -376,13 +376,13 @@ def _poll_for_status(self, op_handle):
376376
)
377377
return self.make_request(self._client.GetOperationStatus, req)
378378

379-
def _create_arrow_table(self, t_row_set, arrow_schema, description):
379+
def _create_arrow_table(self, t_row_set, schema_bytes, description):
380380
if t_row_set.columns is not None:
381381
arrow_table, num_rows = ThriftBackend._convert_column_based_set_to_arrow_table(
382-
t_row_set.columns, arrow_schema)
382+
t_row_set.columns, description)
383383
elif t_row_set.arrowBatches is not None:
384384
arrow_table, num_rows = ThriftBackend._convert_arrow_based_set_to_arrow_table(
385-
t_row_set.arrowBatches, arrow_schema)
385+
t_row_set.arrowBatches, schema_bytes)
386386
else:
387387
raise OperationalError("Unsupported TRowSet instance {}".format(t_row_set))
388388
return self._convert_decimals_in_arrow_table(arrow_table, description), num_rows
@@ -404,9 +404,8 @@ def _convert_decimals_in_arrow_table(table, description):
404404
return table
405405

406406
@staticmethod
407-
def _convert_arrow_based_set_to_arrow_table(arrow_batches, schema):
407+
def _convert_arrow_based_set_to_arrow_table(arrow_batches, schema_bytes):
408408
ba = bytearray()
409-
schema_bytes = schema.serialize().to_pybytes()
410409
ba += schema_bytes
411410
n_rows = 0
412411
for arrow_batch in arrow_batches:
@@ -416,13 +415,13 @@ def _convert_arrow_based_set_to_arrow_table(arrow_batches, schema):
416415
return arrow_table, n_rows
417416

418417
@staticmethod
419-
def _convert_column_based_set_to_arrow_table(columns, schema):
418+
def _convert_column_based_set_to_arrow_table(columns, description):
420419
arrow_table = pyarrow.Table.from_arrays(
421420
[ThriftBackend._convert_column_to_arrow_array(c) for c in columns],
422421
# Only use the column names from the schema, the types are determined by the
423422
# physical types used in column based set, as they can differ from the
424423
# mapping used in _hive_schema_to_arrow_schema.
425-
names=[c.name for c in schema])
424+
names=[c[0] for c in description])
426425
return arrow_table, arrow_table.num_rows
427426

428427
@staticmethod
@@ -555,13 +554,14 @@ def _results_message_to_execute_response(self, resp, operation_state):
555554
has_more_rows = (not direct_results) or (not direct_results.resultSet) \
556555
or direct_results.resultSet.hasMoreRows
557556
description = self._hive_schema_to_description(t_result_set_metadata_resp.schema)
558-
arrow_schema = self._hive_schema_to_arrow_schema(t_result_set_metadata_resp.schema)
557+
schema_bytes = (t_result_set_metadata_resp.arrowSchema or self._hive_schema_to_arrow_schema(
558+
t_result_set_metadata_resp.schema).serialize().to_pybytes())
559559

560560
if direct_results and direct_results.resultSet:
561561
assert (direct_results.resultSet.results.startRowOffset == 0)
562562
assert (direct_results.resultSetMetadata)
563563
arrow_results, n_rows = self._create_arrow_table(direct_results.resultSet.results,
564-
arrow_schema, description)
564+
schema_bytes, description)
565565
arrow_queue_opt = ArrowQueue(arrow_results, n_rows, 0)
566566
else:
567567
arrow_queue_opt = None
@@ -572,7 +572,7 @@ def _results_message_to_execute_response(self, resp, operation_state):
572572
has_more_rows=has_more_rows,
573573
command_handle=resp.operationHandle,
574574
description=description,
575-
arrow_schema=arrow_schema)
575+
arrow_schema_bytes=schema_bytes)
576576

577577
def _wait_until_command_done(self, op_handle, initial_operation_status_resp):
578578
if initial_operation_status_resp:
@@ -697,8 +697,8 @@ def _handle_execute_response(self, resp, cursor):
697697

698698
return self._results_message_to_execute_response(resp, final_operation_state)
699699

700-
def fetch_results(self, op_handle, max_rows, max_bytes, expected_row_start_offset, arrow_schema,
701-
description):
700+
def fetch_results(self, op_handle, max_rows, max_bytes, expected_row_start_offset,
701+
arrow_schema_bytes, description):
702702
assert (op_handle is not None)
703703

704704
req = ttypes.TFetchResultsReq(
@@ -716,7 +716,8 @@ def fetch_results(self, op_handle, max_rows, max_bytes, expected_row_start_offse
716716
if resp.results.startRowOffset > expected_row_start_offset:
717717
logger.warning("Expected results to start from {} but they instead start at {}".format(
718718
expected_row_start_offset, resp.results.startRowOffset))
719-
arrow_results, n_rows = self._create_arrow_table(resp.results, arrow_schema, description)
719+
arrow_results, n_rows = self._create_arrow_table(resp.results, arrow_schema_bytes,
720+
description)
720721
arrow_queue = ArrowQueue(arrow_results, n_rows)
721722

722723
return arrow_queue, resp.hasMoreRows

cmdexec/clients/python/src/databricks/sql/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def remaining_rows(self) -> pyarrow.Table:
3535

3636
ExecuteResponse = namedtuple(
3737
'ExecuteResponse', 'status has_been_closed_server_side has_more_rows description '
38-
'command_handle arrow_queue arrow_schema')
38+
'command_handle arrow_queue arrow_schema_bytes')
3939

4040

4141
def _bound(min_x, max_x, x):

cmdexec/clients/python/tests/test_fetches.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def make_dummy_result_set_from_initial_results(initial_results):
4242
description=Mock(),
4343
command_handle=None,
4444
arrow_queue=arrow_queue,
45-
arrow_schema=schema))
45+
arrow_schema_bytes=schema.serialize().to_pybytes()))
4646
num_cols = len(initial_results[0]) if initial_results else 0
4747
rs.description = [(f'col{col_id}', 'integer', None, None, None, None, None)
4848
for col_id in range(num_cols)]
@@ -52,8 +52,8 @@ def make_dummy_result_set_from_initial_results(initial_results):
5252
def make_dummy_result_set_from_batch_list(batch_list):
5353
batch_index = 0
5454

55-
def fetch_results(op_handle, max_rows, max_bytes, expected_row_start_offset, arrow_schema,
56-
description):
55+
def fetch_results(op_handle, max_rows, max_bytes, expected_row_start_offset,
56+
arrow_schema_bytes, description):
5757
nonlocal batch_index
5858
results = FetchTests.make_arrow_queue(batch_list[batch_index])
5959
batch_index += 1
@@ -75,7 +75,7 @@ def fetch_results(op_handle, max_rows, max_bytes, expected_row_start_offset, arr
7575
for col_id in range(num_cols)],
7676
command_handle=None,
7777
arrow_queue=None,
78-
arrow_schema=None))
78+
arrow_schema_bytes=None))
7979
return rs
8080

8181
def assertEqualRowValues(self, actual, expected):

cmdexec/clients/python/tests/test_thrift_backend.py

Lines changed: 67 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,54 @@ def test_handle_execute_response_can_handle_with_direct_results(self):
497497
ttypes.TOperationState.FINISHED_STATE,
498498
)
499499

500+
@patch("databricks.sql.thrift_backend.TCLIService.Client")
501+
def test_use_arrow_schema_if_available(self, tcli_service_class):
502+
tcli_service_instance = tcli_service_class.return_value
503+
arrow_schema_mock = MagicMock(name="Arrow schema mock")
504+
hive_schema_mock = MagicMock(name="Hive schema mock")
505+
506+
t_get_result_set_metadata_resp = ttypes.TGetResultSetMetadataResp(
507+
status=self.okay_status,
508+
resultFormat=ttypes.TSparkRowSetType.ARROW_BASED_SET,
509+
schema=hive_schema_mock,
510+
arrowSchema=arrow_schema_mock)
511+
512+
t_execute_resp = ttypes.TExecuteStatementResp(
513+
status=self.okay_status,
514+
directResults=None,
515+
operationHandle=self.operation_handle,
516+
)
517+
518+
tcli_service_instance.GetResultSetMetadata.return_value = t_get_result_set_metadata_resp
519+
thrift_backend = self._make_fake_thrift_backend()
520+
execute_response = thrift_backend._handle_execute_response(t_execute_resp, Mock())
521+
522+
self.assertEqual(execute_response.arrow_schema_bytes, arrow_schema_mock)
523+
524+
@patch("databricks.sql.thrift_backend.TCLIService.Client")
525+
def test_fall_back_to_hive_schema_if_no_arrow_schema(self, tcli_service_class):
526+
tcli_service_instance = tcli_service_class.return_value
527+
hive_schema_mock = MagicMock(name="Hive schema mock")
528+
529+
hive_schema_req = ttypes.TGetResultSetMetadataResp(
530+
status=self.okay_status,
531+
resultFormat=ttypes.TSparkRowSetType.ARROW_BASED_SET,
532+
arrowSchema=None,
533+
schema=hive_schema_mock)
534+
535+
t_execute_resp = ttypes.TExecuteStatementResp(
536+
status=self.okay_status,
537+
directResults=None,
538+
operationHandle=self.operation_handle,
539+
)
540+
541+
tcli_service_instance.GetResultSetMetadata.return_value = hive_schema_req
542+
thrift_backend = self._make_fake_thrift_backend()
543+
thrift_backend._handle_execute_response(t_execute_resp, Mock())
544+
545+
self.assertEqual(hive_schema_mock,
546+
thrift_backend._hive_schema_to_arrow_schema.call_args[0][0])
547+
500548
@patch("databricks.sql.thrift_backend.TCLIService.Client")
501549
def test_handle_execute_response_reads_has_more_rows_in_direct_results(
502550
self, tcli_service_class):
@@ -567,7 +615,7 @@ def test_handle_execute_response_reads_has_more_rows_in_result_response(
567615
max_rows=1,
568616
max_bytes=1,
569617
expected_row_start_offset=0,
570-
arrow_schema=Mock(),
618+
arrow_schema_bytes=Mock(),
571619
description=Mock())
572620

573621
self.assertEqual(has_more_rows, has_more_rows_resp)
@@ -591,15 +639,15 @@ def test_arrow_batches_row_count_are_respected(self, tcli_service_class):
591639
pyarrow.field("column2", pyarrow.string()),
592640
pyarrow.field("column3", pyarrow.float64()),
593641
pyarrow.field("column3", pyarrow.binary())
594-
])
642+
]).serialize().to_pybytes()
595643

596644
thrift_backend = ThriftBackend("foobar", 443, "path", [])
597645
arrow_queue, has_more_results = thrift_backend.fetch_results(
598646
op_handle=Mock(),
599647
max_rows=1,
600648
max_bytes=1,
601649
expected_row_start_offset=0,
602-
arrow_schema=schema,
650+
arrow_schema_bytes=schema,
603651
description=MagicMock())
604652

605653
self.assertEqual(arrow_queue.n_valid_rows, 15 * 10)
@@ -792,24 +840,21 @@ def test_create_arrow_table_calls_correct_conversion_method(self, convert_col_mo
792840
schema = Mock()
793841
cols = Mock()
794842
arrow_batches = Mock()
843+
description = Mock()
795844

796845
t_col_set = ttypes.TRowSet(columns=cols)
797-
thrift_backend._create_arrow_table(t_col_set, schema, Mock())
846+
thrift_backend._create_arrow_table(t_col_set, schema, description)
798847
convert_arrow_mock.assert_not_called()
799-
convert_col_mock.assert_called_once_with(cols, schema)
848+
convert_col_mock.assert_called_once_with(cols, description)
800849

801850
t_arrow_set = ttypes.TRowSet(arrowBatches=arrow_batches)
802851
thrift_backend._create_arrow_table(t_arrow_set, schema, Mock())
803852
convert_arrow_mock.assert_called_once_with(arrow_batches, schema)
804-
convert_col_mock.assert_called_once_with(cols, schema)
805853

806854
def test_convert_column_based_set_to_arrow_table_without_nulls(self):
807-
schema = pyarrow.schema([
808-
pyarrow.field("column1", pyarrow.int32()),
809-
pyarrow.field("column2", pyarrow.string()),
810-
pyarrow.field("column3", pyarrow.float64()),
811-
pyarrow.field("column3", pyarrow.binary())
812-
])
855+
# Deliberately duplicate the column name to check that dups work
856+
field_names = ["column1", "column2", "column3", "column3"]
857+
description = [(name, ) for name in field_names]
813858

814859
t_cols = [
815860
ttypes.TColumn(i32Val=ttypes.TI32Column(values=[1, 2, 3], nulls=bytes(1))),
@@ -820,7 +865,8 @@ def test_convert_column_based_set_to_arrow_table_without_nulls(self):
820865
binaryVal=ttypes.TBinaryColumn(values=[b'\x11', b'\x22', b'\x33'], nulls=bytes(1)))
821866
]
822867

823-
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(t_cols, schema)
868+
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(
869+
t_cols, description)
824870
self.assertEqual(n_rows, 3)
825871

826872
# Check schema, column names and types
@@ -841,12 +887,8 @@ def test_convert_column_based_set_to_arrow_table_without_nulls(self):
841887
self.assertEqual(arrow_table.column(3).to_pylist(), [b'\x11', b'\x22', b'\x33'])
842888

843889
def test_convert_column_based_set_to_arrow_table_with_nulls(self):
844-
schema = pyarrow.schema([
845-
pyarrow.field("column1", pyarrow.int32()),
846-
pyarrow.field("column2", pyarrow.string()),
847-
pyarrow.field("column3", pyarrow.float64()),
848-
pyarrow.field("column3", pyarrow.binary())
849-
])
890+
field_names = ["column1", "column2", "column3", "column3"]
891+
description = [(name, ) for name in field_names]
850892

851893
t_cols = [
852894
ttypes.TColumn(i32Val=ttypes.TI32Column(values=[1, 2, 3], nulls=bytes([1]))),
@@ -859,7 +901,8 @@ def test_convert_column_based_set_to_arrow_table_with_nulls(self):
859901
values=[b'\x11', b'\x22', b'\x33'], nulls=bytes([3])))
860902
]
861903

862-
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(t_cols, schema)
904+
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(
905+
t_cols, description)
863906
self.assertEqual(n_rows, 3)
864907

865908
# Check data
@@ -869,12 +912,8 @@ def test_convert_column_based_set_to_arrow_table_with_nulls(self):
869912
self.assertEqual(arrow_table.column(3).to_pylist(), [None, None, b'\x33'])
870913

871914
def test_convert_column_based_set_to_arrow_table_uses_types_from_col_set(self):
872-
schema = pyarrow.schema([
873-
pyarrow.field("column1", pyarrow.string()),
874-
pyarrow.field("column2", pyarrow.string()),
875-
pyarrow.field("column3", pyarrow.string()),
876-
pyarrow.field("column3", pyarrow.string())
877-
])
915+
field_names = ["column1", "column2", "column3", "column3"]
916+
description = [(name, ) for name in field_names]
878917

879918
t_cols = [
880919
ttypes.TColumn(i32Val=ttypes.TI32Column(values=[1, 2, 3], nulls=bytes(1))),
@@ -885,7 +924,8 @@ def test_convert_column_based_set_to_arrow_table_uses_types_from_col_set(self):
885924
binaryVal=ttypes.TBinaryColumn(values=[b'\x11', b'\x22', b'\x33'], nulls=bytes(1)))
886925
]
887926

888-
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(t_cols, schema)
927+
arrow_table, n_rows = ThriftBackend._convert_column_based_set_to_arrow_table(
928+
t_cols, description)
889929
self.assertEqual(n_rows, 3)
890930

891931
# Check schema, column names and types

0 commit comments

Comments (0)