@@ -53,7 +53,7 @@ class Insert(ABC):
     rounds = 1
 
     @abc.abstractmethod
-    def setup_cache(self):
+    def setup(self):
         raise NotImplementedError
 
     def time_insert_arrow(self):
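Context for the rename: these classes look like asv (airspeed velocity) suites (note the rounds attribute). Under asv, setup_cache runs once per environment in a separate process and only its return value is handed to the timed methods, so attributes assigned to self there never reach the benchmarks; setup, by contrast, runs before each benchmark on a live instance. A minimal sketch of the two hooks, assuming asv semantics (class and method names hypothetical):

    class CachedOnce:
        def setup_cache(self):
            # Runs once, out of process; only the return value survives,
            # and it is passed back in as an extra argument.
            return {"x": 1}

        def time_lookup(self, cache):
            cache["x"]

    class PerRun:
        def setup(self):
            # Runs before every repetition; self persists into the timed
            # method, which is what the Insert/Read classes here rely on.
            self.data = {"x": 1}

        def time_lookup(self):
            self.data["x"]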
@@ -94,7 +94,7 @@ class Read(ABC):
     rounds = 1
 
     @abc.abstractmethod
-    def setup_cache(self):
+    def setup(self):
         raise NotImplementedError
 
     # We need this because the naive methods don't always convert nested objects.
@@ -160,7 +160,7 @@ class ProfileReadArray(Read):
         }
     )
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict(
@@ -205,7 +205,7 @@ class ProfileReadDocument(Read):
         }
     )
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict(
@@ -247,7 +247,7 @@ class ProfileReadSmall(Read):
     schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
     dtypes = np.dtype(np.dtype([("x", np.int64), ("y", np.float64)]))
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict(
@@ -268,7 +268,7 @@ class ProfileReadLarge(Read):
     schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
     dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
 
@@ -284,7 +284,7 @@ class ProfileReadExtensionSmall(Read):
     schema = Schema({"x": Decimal128Type(), "y": BinaryType(10)})
     dtypes = np.dtype(np.dtype([("x", np.object_), ("y", np.object_)]))
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict(
@@ -299,13 +299,20 @@ def setup_cache(self):
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
 
+    # These must be skipped because PyArrow can't read Decimal128Type.
+    def time_conventional_arrow(self):
+        pass
+
+    def time_insert_conventional(self):
+        pass
+
 
 class ProfileReadExtensionLarge(Read):
     large_doc_keys = [f"{i}" for i in range(LARGE_DOC_SIZE)]
     schema = Schema({k: Decimal128Type() for k in large_doc_keys})
     dtypes = np.dtype([(k, np.object_) for k in large_doc_keys])
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
 
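The no-op time_conventional_arrow and time_insert_conventional overrides added here (and mirrored in the next hunk) neutralize benchmarks inherited from Read/Insert that can't run against Decimal128 data, since PyArrow has no native reader for BSON Decimal128 values. The shape of the pattern, reduced to essentials (names hypothetical):

    class ReadSuite:
        def time_convert(self):
            ...  # real conversion work, timed by the harness

    class Decimal128Suite(ReadSuite):
        def time_convert(self):
            # Unsupported for this type: override with a no-op so the
            # inherited benchmark no longer exercises the conversion.
            pass

The harness will still record a near-zero timing for the no-op; if these are asv suites, the documented way to skip a benchmark outright is to raise NotImplementedError from setup, so the empty override is the lighter-weight choice here.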
@@ -316,16 +323,20 @@ def setup_cache(self):
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
 
+    # These must be skipped because PyArrow can't read Decimal128Type.
+    def time_conventional_arrow(self):
+        pass
+
+    def time_insert_conventional(self):
+        pass
+
 
 class ProfileInsertSmall(Insert):
     large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
     schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
-    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
     dtypes = np.dtype([("x", np.int64), ("y", np.float64)])
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict([("x", 1), ("y", math.pi)])
@@ -334,17 +345,17 @@ def setup_cache(self):
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
+        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=self.schema)
+        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=self.schema)
+        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=self.schema)
 
 
 class ProfileInsertLarge(Insert):
     large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
     schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
-    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
     dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
 
-    def setup_cache(self):
+    def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict([(k, math.pi) for k in self.large_doc_keys])
@@ -353,3 +364,6 @@ def setup_cache(self):
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
        )
+        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=self.schema)
+        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=self.schema)
+        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=self.schema)
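These last two hunks also fix an evaluation-order bug, not just the rename: arrow_table, pandas_table, and numpy_arrays used to be class attributes, so the find_*_all queries executed once at import time, against whatever db.benchmark happened to contain before any benchmark had seeded it. Building them inside setup ties each table to the documents the benchmark just inserted. A reduced illustration of the pitfall (query is a hypothetical stand-in for find_arrow_all):

    def query():
        print("query ran")
        return []

    class Broken:
        # Class bodies execute at import: this runs before any setup
        # has populated the collection, and the stale result is shared.
        data = query()

    class Fixed:
        def setup(self):
            # Runs per benchmark, after the data has been seeded.
            self.data = query()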