[go: nahoru, domu]

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: consolidate PyarrowVersions helpers #1679

Merged
merged 16 commits into from
Oct 18, 2023
Prev Previous commit
Next Next commit
address comments
  • Loading branch information
Linchin committed Oct 14, 2023
commit 465e3786498e3eb584dba90b303fb197c363385d
13 changes: 7 additions & 6 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from google.cloud.bigquery import _helpers
from google.cloud.bigquery import _pyarrow_helpers
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import schema

try:
Expand All @@ -48,7 +49,7 @@
db_dtypes_import_exception = exc
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype

pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

_BIGNUMERIC_SUPPORT = False
if pyarrow is not None:
Expand Down Expand Up @@ -162,7 +163,7 @@ def bq_to_arrow_data_type(field):
if field_type_upper in schema._STRUCT_TYPES:
Linchin marked this conversation as resolved.
Show resolved Hide resolved
return bq_to_arrow_struct_data_type(field)

data_type_constructor = _pyarrow_helpers.PYARROW_VERSIONS.bq_to_arrow_scalars(
data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
field_type_upper
)
if data_type_constructor is None:
Expand Down Expand Up @@ -492,7 +493,7 @@ def augment_schema(dataframe, current_bq_schema):
if pyarrow.types.is_list(arrow_table.type):
# `pyarrow.ListType`
detected_mode = "REPEATED"
detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
arrow_table.values.type.id
)

Expand All @@ -510,7 +511,7 @@ def augment_schema(dataframe, current_bq_schema):
detected_type = "DATETIME"
else:
detected_mode = field.mode
detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
arrow_table.type.id
)

Expand Down Expand Up @@ -633,13 +634,13 @@ def dataframe_to_parquet(

This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
"""
pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet # type: ignore

kwargs = (
{"use_compliant_nested_type": parquet_use_compliant_nested_type}
if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type
if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
else {}
)

Expand Down
106 changes: 15 additions & 91 deletions google/cloud/bigquery/_pyarrow_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@

from typing import Any

from google.cloud.bigquery.exceptions import LegacyPyarrowError

import packaging.version

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")

try:
import pyarrow # type: ignore
except ImportError: # pragma: NO COVER
Expand Down Expand Up @@ -101,88 +95,18 @@ def pyarrow_timestamp():
pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
}


class PyarrowVersions:
"""Version comparisons for pyarrow package."""

def __init__(self):
self._installed_version = None

@property
def installed_version(self) -> packaging.version.Version:
"""Return the parsed version of pyarrow."""
if self._installed_version is None:
import pyarrow # type: ignore

self._installed_version = packaging.version.parse(
# Use 0.0.0, since it is earlier than any released version.
# Legacy versions also have the same property, but
# creating a LegacyVersion has been deprecated.
# https://github.com/pypa/packaging/issues/321
getattr(pyarrow, "__version__", "0.0.0")
)

return self._installed_version

@staticmethod
def bq_to_arrow_scalars(bq_scalar: str):
"""
Returns:
The Arrow scalar type that the input BigQuery scalar type maps to.
If cannot find the BigQuery scalar, return None.
"""
return _BQ_TO_ARROW_SCALARS.get(bq_scalar)

@staticmethod
def arrow_scalar_ids_to_bq(arrow_scalar: Any):
"""
Returns:
The BigQuery scalar type that the input arrow scalar type maps to.
If cannot find the arrow scalar, return None.
"""
return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar)

@property
def use_compliant_nested_type(self) -> bool:
return self.installed_version.major >= 4

def try_import(self, raise_if_error: bool = False) -> Any:
"""Verify that a recent enough version of pyarrow extra is
installed.

The function assumes that pyarrow extra is installed, and should thus
be used in places where this assumption holds.

Because `pip` can install an outdated version of this extra despite the
constraints in `setup.py`, the calling code can use this helper to
verify the version compatibility at runtime.

Returns:
The ``pyarrow`` module or ``None``.

Raises:
LegacyPyarrowError:
If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
"""
try:
import pyarrow
except ImportError as exc: # pragma: NO COVER
if raise_if_error:
raise LegacyPyarrowError(
f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
) from exc
return None

if self.installed_version < _MIN_PYARROW_VERSION:
if raise_if_error:
msg = (
"Dependency pyarrow is outdated, please upgrade "
f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
)
raise LegacyPyarrowError(msg)
return None

return pyarrow


PYARROW_VERSIONS = PyarrowVersions()
def bq_to_arrow_scalars(bq_scalar: str):
    """Look up the Arrow scalar constructor for a BigQuery scalar type.

    Returns:
        The Arrow scalar type that the input BigQuery scalar type maps to.
        If it cannot find the BigQuery scalar, return None.
    """
    try:
        return _BQ_TO_ARROW_SCALARS[bq_scalar]
    except KeyError:
        # Unknown BigQuery scalar type: report "no mapping" rather than raise.
        return None

def arrow_scalar_ids_to_bq(arrow_scalar: Any):
    """Map a pyarrow scalar type id back to a BigQuery scalar type name.

    Returns:
        The BigQuery scalar type that the input arrow scalar type maps to.
        If it cannot find the arrow scalar, return None.
    """
    try:
        return _ARROW_SCALAR_IDS_TO_BQ[arrow_scalar]
    except KeyError:
        # Unknown arrow scalar id: report "no mapping" rather than raise.
        return None
96 changes: 96 additions & 0 deletions google/cloud/bigquery/_versions_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared helper functions for verifying versions of installed modules."""

from typing import Any

import packaging.version

from google.cloud.bigquery.exceptions import LegacyPyarrowError
Linchin marked this conversation as resolved.
Show resolved Hide resolved


_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")

# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
Linchin marked this conversation as resolved.
Show resolved Hide resolved

class PyarrowVersions:
    """Version comparisons for pyarrow package.

    Lazily resolves the installed pyarrow version so that importing this
    module never requires pyarrow to be present.
    """

    def __init__(self):
        # Cache for the parsed pyarrow version; populated on first access
        # of ``installed_version`` (lazy so module import stays cheap and
        # pyarrow-free).
        self._installed_version = None

    @property
    def installed_version(self) -> packaging.version.Version:
        """Return the parsed version of pyarrow."""
        if self._installed_version is None:
            import pyarrow  # type: ignore

            self._installed_version = packaging.version.parse(
                # Use 0.0.0, since it is earlier than any released version.
                # Legacy versions also have the same property, but
                # creating a LegacyVersion has been deprecated.
                # https://github.com/pypa/packaging/issues/321
                getattr(pyarrow, "__version__", "0.0.0")
            )

        return self._installed_version

    @property
    def use_compliant_nested_type(self) -> bool:
        # The ``use_compliant_nested_type`` Parquet writer option is only
        # honored by pyarrow >= 4.0; callers use this flag to decide
        # whether to pass it through.
        return self.installed_version.major >= 4

    def try_import(self, raise_if_error: bool = False) -> Any:
        """Verify that a recent enough version of pyarrow extra is installed.

        The function assumes that pyarrow extra is installed, and should thus
        be used in places where this assumption holds.

        Because `pip` can install an outdated version of this extra despite
        the constraints in `setup.py`, the calling code can use this helper
        to verify the version compatibility at runtime.

        Returns:
            The ``pyarrow`` module or ``None``.

        Raises:
            LegacyPyarrowError:
                If the pyarrow package is outdated and ``raise_if_error`` is
                ``True``.
        """
        try:
            import pyarrow
        except ImportError as exc:  # pragma: NO COVER
            # pyarrow missing entirely: either surface it as the same
            # LegacyPyarrowError callers already handle, or signal with None.
            if raise_if_error:
                raise LegacyPyarrowError(
                    "pyarrow package not found. Install pyarrow version >="
                    f" {_MIN_PYARROW_VERSION}."
                ) from exc
            return None

        # NOTE(review): ``_PYARROW_BAD_VERSIONS`` is declared above but not
        # consulted here — confirm whether known-bad versions should also be
        # rejected or warned about in this check.
        if self.installed_version < _MIN_PYARROW_VERSION:
            if raise_if_error:
                msg = (
                    "Dependency pyarrow is outdated, please upgrade"
                    f" it to version >= {_MIN_PYARROW_VERSION}"
                    f" (version found: {self.installed_version})."
                )
                raise LegacyPyarrowError(msg)
            return None

        return pyarrow


# Module-level singleton; shared by all callers that need pyarrow
# version checks (e.g. ``_pandas_helpers`` and ``client``).
PYARROW_VERSIONS = PyarrowVersions()
10 changes: 6 additions & 4 deletions google/cloud/bigquery/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,14 @@
from google.cloud.bigquery._helpers import _DEFAULT_HOST
from google.cloud.bigquery._http import Connection
from google.cloud.bigquery import _pandas_helpers
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery.dataset import Dataset
from google.cloud.bigquery.dataset import DatasetListItem
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery import enums
from google.cloud.bigquery.enums import AutoRowIDs
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.exceptions import LegacyPyarrowError
from google.cloud.bigquery.opentelemetry_tracing import create_span
from google.cloud.bigquery import job
from google.cloud.bigquery.job import (
Expand Down Expand Up @@ -114,9 +116,7 @@
from google.cloud.bigquery.table import RowIterator
from google.cloud.bigquery.format_options import ParquetOptions

from google.cloud.bigquery import _pyarrow_helpers

pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

TimeoutType = Union[float, None]
ResumableTimeoutType = Union[
Expand Down Expand Up @@ -2678,6 +2678,8 @@ def load_table_from_dataframe(

try:
if new_job_config.source_format == job.SourceFormat.PARQUET:
_versions_helpers.PYARROW_VERSIONS.try_import()

if new_job_config.schema:
if parquet_compression == "snappy": # adjust the default value
parquet_compression = parquet_compression.upper()
Expand All @@ -2696,7 +2698,7 @@ def load_table_from_dataframe(
compression=parquet_compression,
**(
{"use_compliant_nested_type": True}
if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type
if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
else {}
),
)
Expand Down
3 changes: 1 addition & 2 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@
# Unconditionally import optional dependencies again to tell pytype that
# they are not None, avoiding false "no attribute" errors.
import pandas

pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
import pyarrow
import geopandas # type: ignore
from google.cloud import bigquery_storage # type: ignore
from google.cloud.bigquery.dataset import DatasetReference
Expand Down
7 changes: 7 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ def unit(session):
@nox.session(python=[UNIT_TEST_PYTHON_VERSIONS[0], UNIT_TEST_PYTHON_VERSIONS[-1]])
def unit_noextras(session):
"""Run the unit test suite."""

# Install optional dependencies that are out-of-date.
# https://github.com/googleapis/python-bigquery/issues/933
# There is no pyarrow 1.0.0 package for Python 3.9.
if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
session.install("pyarrow==1.0.0")

default(session, install_extras=False)


Expand Down
5 changes: 3 additions & 2 deletions tests/unit/test__pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@
from google.cloud.bigquery import exceptions
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import _pyarrow_helpers
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import schema
from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT

pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

if pyarrow:
import pyarrow.parquet
Expand Down Expand Up @@ -1119,7 +1120,7 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch):
"pyarrow not installed"
)
monkeypatch.setattr(
_pyarrow_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import
_versions_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import
)

with pytest.raises(exceptions.LegacyPyarrowError):
Expand Down
Loading