From 489944f62c4a878b249f494d14a1111dbc0654b3 Mon Sep 17 00:00:00 2001 From: vanshaj2023 Date: Sun, 22 Feb 2026 01:22:38 +0530 Subject: [PATCH 1/4] GH-49273: [Python] Move stub docstring script to _build_utils with graceful degradation --- python/MANIFEST.in | 1 - python/pyarrow/_build_utils/__init__.py | 16 ++ .../_build_utils/update_stub_docstrings.py | 230 ++++++++++++++++++ python/pyproject.toml | 5 +- python/scripts/update_stub_docstrings.py | 205 +--------------- python/setup.py | 25 +- 6 files changed, 266 insertions(+), 216 deletions(-) create mode 100644 python/pyarrow/_build_utils/__init__.py create mode 100644 python/pyarrow/_build_utils/update_stub_docstrings.py diff --git a/python/MANIFEST.in b/python/MANIFEST.in index c37446c64fe4..5896f1c44a13 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -5,7 +5,6 @@ include ../NOTICE.txt global-include CMakeLists.txt graft pyarrow graft pyarrow-stubs -include scripts/update_stub_docstrings.py graft cmake_modules global-exclude *.so diff --git a/python/pyarrow/_build_utils/__init__.py b/python/pyarrow/_build_utils/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/python/pyarrow/_build_utils/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/_build_utils/update_stub_docstrings.py b/python/pyarrow/_build_utils/update_stub_docstrings.py new file mode 100644 index 000000000000..340c75679729 --- /dev/null +++ b/python/pyarrow/_build_utils/update_stub_docstrings.py @@ -0,0 +1,230 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Extract docstrings from pyarrow runtime and insert them into stub files. 
+ +Usage (from python/ directory with pyarrow built): + python scripts/update_stub_docstrings.py pyarrow-stubs +""" + +import argparse +import importlib +import inspect +import sys +from pathlib import Path +from textwrap import indent + +import libcst +from libcst import matchers as m + + +def _resolve_object(module, path): + """Resolve an object by dotted path from a module.""" + if not path: + return module, None, module.__name__ + + parts = path.split(".") + parent = None + obj = module + + for part in parts: + parent = obj + try: + obj = getattr(obj, part) + except AttributeError: + try: + obj = vars(parent).get(part) + if obj is not None: + continue + except TypeError: + pass + return None, None, None + + return obj, parent, getattr(obj, "__name__", parts[-1]) + + +def _get_docstring(name, module, indentation): + """Extract and format a docstring for insertion into a stub file.""" + obj, parent, obj_name = _resolve_object(module, name) + if obj is None: + print(f"{name} not found in {module.__name__}") + return None + + docstring = inspect.getdoc(obj) + if not docstring: + return None + + # Remove signature prefix + parent_name = getattr(parent, "__name__", None) if parent else None + if docstring.startswith(obj_name) or ( + parent_name and docstring.startswith(f"{parent_name}.{obj_name}") + ): + docstring = "\n".join(docstring.splitlines()[2:]) + + # Skip empty docstrings + if not docstring.strip(): + return None + + prefix = " " * indentation + return '"""\n' + indent(docstring + '\n"""', prefix) + + +class DocstringInserter(libcst.CSTTransformer): + """CST transformer that inserts docstrings into stub file nodes.""" + + def __init__(self, module, namespace): + self.module = module + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + def _full_name(self): + name = ".".join(self.stack) + return f"{self.base_namespace}.{name}" if self.base_namespace else name + + def leave_Module(self, original_node, updated_node): + new_body = [] + 
clone_matcher = m.SimpleStatementLine( + body=[m.Assign(value=m.Call(func=m.Name(value="_clone_signature"))), + m.ZeroOrMore()] + ) + for stmt in updated_node.body: + new_body.append(stmt) + if m.matches(stmt, clone_matcher): + name = stmt.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.module, 0) + if docstring: + new_body.append(libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(docstring))])) + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = self._full_name() + docstring = _get_docstring(name, self.module, self.indentation) + + if docstring: + ellipsis_class = m.ClassDef(body=m.IndentedBlock(body=[ + m.SimpleStatementLine(body=[ + m.Expr(m.Ellipsis()), m.ZeroOrMore()]), m.ZeroOrMore()])) + func_class = m.ClassDef(body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()])) + + if m.matches(updated_node, ellipsis_class): + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, + libcst.SimpleString(value=docstring)) + elif m.matches(updated_node, func_class): + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=updated_node.body.with_changes( + body=[docstring_stmt] + list(updated_node.body.body))) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = self._full_name() + ellipsis_func = m.FunctionDef( + body=m.SimpleStatementSuite(body=[m.Expr(m.Ellipsis())])) + + if m.matches(original_node, ellipsis_func): + docstring = _get_docstring(name, self.module, 
self.indentation) + if docstring: + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=libcst.IndentedBlock(body=[docstring_stmt])) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +LIB_MODULES = {"array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"} + + +def add_docstrings_to_stubs(stubs_dir): + """Update all stub files in stubs_dir with docstrings from pyarrow runtime.""" + stubs_dir = Path(stubs_dir) + print(f"Updating stub docstrings in: {stubs_dir}") + + pyarrow = importlib.import_module("pyarrow") + + for stub_file in stubs_dir.rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + + module_name = stub_file.stem + if module_name in LIB_MODULES: + namespace = "lib" + elif stub_file.parent.name in ("parquet", "interchange"): + namespace = f"{stub_file.parent.name}.{module_name}" + elif module_name == "__init__": + namespace = "" + else: + namespace = module_name + + print(f" {stub_file.name} -> {namespace or '(root)'}") + tree = libcst.parse_module(stub_file.read_text()) + modified = tree.visit(DocstringInserter(pyarrow, namespace)) + stub_file.write_text(modified.code) + + +def add_docstrings_from_build(stubs_dir, build_lib): + """ + Entry point for setup.py: update docstrings using pyarrow from build directory. + + During the build process, pyarrow is not installed in the system Python. + We need to temporarily add the build directory to sys.path so we can + import pyarrow and extract docstrings from it. 
+ """ + stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) + + sys.path.insert(0, str(build_lib)) + try: + add_docstrings_to_stubs(stubs_dir) + finally: + sys.path.pop(0) + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("stubs_dir", type=Path, + help="Path to pyarrow-stubs folder") + args = parser.parse_args() + + python_dir = Path(__file__).resolve().parent.parent.parent + sys.path.insert(0, str(python_dir)) + add_docstrings_to_stubs(args.stubs_dir.resolve()) + + +if __name__ == "__main__": + main() diff --git a/python/pyproject.toml b/python/pyproject.toml index 217dba81b873..cecbefd97e02 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,8 +18,8 @@ [build-system] requires = [ "cython >= 3.1", - # Needed for build-time stub docstring extraction - "libcst>=1.8.6", + # Optional: enables stub docstring injection during build + # "libcst>=1.8.6", "numpy>=1.25", # configuring setuptools_scm in pyproject.toml requires # versions released after 2022 @@ -87,6 +87,7 @@ include-package-data=true [tool.setuptools.packages.find] include = ["pyarrow"] +exclude = ["pyarrow._build_utils", "pyarrow._build_utils.*"] namespaces = false [tool.setuptools.package-data] diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index 5fd24014a024..72a89fca0fd0 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -16,213 +16,16 @@ # under the License. """ -Extract docstrings from pyarrow runtime and insert them into stub files. +CLI wrapper for stub docstring injection. 
Usage (from python/ directory with pyarrow built): python scripts/update_stub_docstrings.py pyarrow-stubs """ -import argparse -import importlib -import inspect import sys from pathlib import Path -from textwrap import indent - -import libcst -from libcst import matchers as m - - -def _resolve_object(module, path): - """Resolve an object by dotted path from a module.""" - if not path: - return module, None, module.__name__ - - parts = path.split(".") - parent = None - obj = module - - for part in parts: - parent = obj - try: - obj = getattr(obj, part) - except AttributeError: - try: - obj = vars(parent).get(part) - if obj is not None: - continue - except TypeError: - pass - return None, None, None - - return obj, parent, getattr(obj, "__name__", parts[-1]) - - -def _get_docstring(name, module, indentation): - """Extract and format a docstring for insertion into a stub file.""" - obj, parent, obj_name = _resolve_object(module, name) - if obj is None: - print(f"{name} not found in {module.__name__}") - return None - - docstring = inspect.getdoc(obj) - if not docstring: - return None - - # Remove signature prefix - parent_name = getattr(parent, "__name__", None) if parent else None - if docstring.startswith(obj_name) or ( - parent_name and docstring.startswith(f"{parent_name}.{obj_name}") - ): - docstring = "\n".join(docstring.splitlines()[2:]) - - # Skip empty docstrings - if not docstring.strip(): - return None - - prefix = " " * indentation - return '"""\n' + indent(docstring + '\n"""', prefix) - - -class DocstringInserter(libcst.CSTTransformer): - """CST transformer that inserts docstrings into stub file nodes.""" - - def __init__(self, module, namespace): - self.module = module - self.base_namespace = namespace - self.stack = [] - self.indentation = 0 - - def _full_name(self): - name = ".".join(self.stack) - return f"{self.base_namespace}.{name}" if self.base_namespace else name - - def leave_Module(self, original_node, updated_node): - new_body = [] - 
clone_matcher = m.SimpleStatementLine( - body=[m.Assign(value=m.Call(func=m.Name(value="_clone_signature"))), - m.ZeroOrMore()] - ) - for stmt in updated_node.body: - new_body.append(stmt) - if m.matches(stmt, clone_matcher): - name = stmt.body[0].targets[0].target.value - if self.base_namespace: - name = f"{self.base_namespace}.{name}" - docstring = _get_docstring(name, self.module, 0) - if docstring: - new_body.append(libcst.SimpleStatementLine( - body=[libcst.Expr(value=libcst.SimpleString(docstring))])) - return updated_node.with_changes(body=new_body) - - def visit_ClassDef(self, node): - self.stack.append(node.name.value) - self.indentation += 1 - - def leave_ClassDef(self, original_node, updated_node): - name = self._full_name() - docstring = _get_docstring(name, self.module, self.indentation) - - if docstring: - ellipsis_class = m.ClassDef(body=m.IndentedBlock(body=[ - m.SimpleStatementLine(body=[ - m.Expr(m.Ellipsis()), m.ZeroOrMore()]), m.ZeroOrMore()])) - func_class = m.ClassDef(body=m.IndentedBlock( - body=[m.FunctionDef(), m.ZeroOrMore()])) - - if m.matches(updated_node, ellipsis_class): - updated_node = updated_node.deep_replace( - updated_node.body.body[0].body[0].value, - libcst.SimpleString(value=docstring)) - elif m.matches(updated_node, func_class): - docstring_stmt = libcst.SimpleStatementLine( - body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) - updated_node = updated_node.with_changes( - body=updated_node.body.with_changes( - body=[docstring_stmt] + list(updated_node.body.body))) - - self.stack.pop() - self.indentation -= 1 - return updated_node - - def visit_FunctionDef(self, node): - self.stack.append(node.name.value) - self.indentation += 1 - - def leave_FunctionDef(self, original_node, updated_node): - name = self._full_name() - ellipsis_func = m.FunctionDef( - body=m.SimpleStatementSuite(body=[m.Expr(m.Ellipsis())])) - - if m.matches(original_node, ellipsis_func): - docstring = _get_docstring(name, self.module, 
self.indentation) - if docstring: - docstring_stmt = libcst.SimpleStatementLine( - body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) - updated_node = updated_node.with_changes( - body=libcst.IndentedBlock(body=[docstring_stmt])) - - self.stack.pop() - self.indentation -= 1 - return updated_node - - -LIB_MODULES = {"array", "builder", "compat", "config", "device", "error", "io", - "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"} - - -def add_docstrings_to_stubs(stubs_dir): - """Update all stub files in stubs_dir with docstrings from pyarrow runtime.""" - stubs_dir = Path(stubs_dir) - print(f"Updating stub docstrings in: {stubs_dir}") - - pyarrow = importlib.import_module("pyarrow") - - for stub_file in stubs_dir.rglob('*.pyi'): - if stub_file.name == "_stubs_typing.pyi": - continue - - module_name = stub_file.stem - if module_name in LIB_MODULES: - namespace = "lib" - elif stub_file.parent.name in ("parquet", "interchange"): - namespace = f"{stub_file.parent.name}.{module_name}" - elif module_name == "__init__": - namespace = "" - else: - namespace = module_name - - print(f" {stub_file.name} -> {namespace or '(root)'}") - tree = libcst.parse_module(stub_file.read_text()) - modified = tree.visit(DocstringInserter(pyarrow, namespace)) - stub_file.write_text(modified.code) - - -def add_docstrings_from_build(stubs_dir, build_lib): - """ - Entry point for setup.py: update docstrings using pyarrow from build directory. - - During the build process, pyarrow is not installed in the system Python. - We need to temporarily add the build directory to sys.path so we can - import pyarrow and extract docstrings from it. 
- """ - stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) - - sys.path.insert(0, str(build_lib)) - try: - add_docstrings_to_stubs(stubs_dir) - finally: - sys.path.pop(0) - if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder") - args = parser.parse_args() - - # Add the directory containing this script's parent (python/) to sys.path - # so pyarrow can be imported when running from the python/ directory - script_dir = Path(__file__).resolve().parent - python_dir = script_dir.parent - sys.path.insert(0, str(python_dir)) - add_docstrings_to_stubs(args.stubs_dir.resolve()) + sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + from pyarrow._build_utils.update_stub_docstrings import main + main() diff --git a/python/setup.py b/python/setup.py index 4f2bf7585e13..23b468db7c38 100755 --- a/python/setup.py +++ b/python/setup.py @@ -133,20 +133,21 @@ def _update_stubs(self): build_cmd = self.get_finalized_command('build') build_lib = os.path.abspath(build_cmd.build_lib) - # Copy clean stubs to build directory first self._copy_stubs(stubs_dir, build_lib) - # Inject docstrings into the build copies (not the source stubs). - # We pass build_lib as stubs_dir since it mirrors the pyarrow-stubs/ - # directory structure (both contain a pyarrow/ subdirectory with .pyi - # files), so the namespace resolution logic works identically. - import importlib.util - spec = importlib.util.spec_from_file_location( - "update_stub_docstrings", - pjoin(setup_dir, 'scripts', 'update_stub_docstrings.py')) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - mod.add_docstrings_from_build(build_lib, build_lib) + if os.environ.get('PYARROW_SKIP_STUB_DOCSTRINGS', '0') == '1': + print("-- Skipping stub docstring injection " + "(PYARROW_SKIP_STUB_DOCSTRINGS=1)") + return + + # Inject docstrings from the built pyarrow into the stub copies. 
+ try: + from pyarrow._build_utils.update_stub_docstrings import ( + add_docstrings_from_build, + ) + add_docstrings_from_build(build_lib, build_lib) + except ImportError as e: + print(f"-- Skipping stub docstring injection ({e})") def _copy_stubs(self, stubs_dir, build_lib): """Copy .pyi stub files to the build directory.""" From b2d48415e57c87efa47e7fa5aadd85dda05bac75 Mon Sep 17 00:00:00 2001 From: vanshaj2023 Date: Thu, 12 Mar 2026 02:16:11 +0530 Subject: [PATCH 2/4] GH-49273: [Python] Switch build backend to scikit-build-core and remove setup.py --- python/pyproject.toml | 36 ++-- python/setup.py | 481 ------------------------------------------ 2 files changed, 19 insertions(+), 498 deletions(-) delete mode 100755 python/setup.py diff --git a/python/pyproject.toml b/python/pyproject.toml index cecbefd97e02..5843b5937795 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,16 +17,17 @@ [build-system] requires = [ + "scikit-build-core", "cython >= 3.1", - # Optional: enables stub docstring injection during build - # "libcst>=1.8.6", + # Needed for build-time stub docstring extraction + "libcst>=1.8.6", "numpy>=1.25", - # configuring setuptools_scm in pyproject.toml requires - # versions released after 2022 "setuptools_scm[toml]>=8", - "setuptools>=77", ] -build-backend = "setuptools.build_meta" +# We use a thin build backend wrapper over scikit-build-core +# to handle license files, working around links not being included in sdists. 
+build-backend = "_build_backend" +backend-path = ["."] [project] name = "pyarrow" @@ -81,17 +82,18 @@ exclude = [ '\._.*$', ] -[tool.setuptools] -zip-safe=false -include-package-data=true +[tool.scikit-build] +cmake.build-type = "Release" +metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" +sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"] +wheel.packages = ["pyarrow"] +wheel.install-dir = "pyarrow" -[tool.setuptools.packages.find] -include = ["pyarrow"] -exclude = ["pyarrow._build_utils", "pyarrow._build_utils.*"] -namespaces = false - -[tool.setuptools.package-data] -pyarrow = ["*.pxd", "*.pyi", "*.pyx", "includes/*.pxd", "py.typed"] +[tool.scikit-build.cmake.define] +PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP", default = "OFF"} +PYARROW_BUNDLE_CYTHON_CPP = {env = "PYARROW_BUNDLE_CYTHON_CPP", default = "OFF"} +PYARROW_GENERATE_COVERAGE = {env = "PYARROW_GENERATE_COVERAGE", default = "OFF"} +PYARROW_CXXFLAGS = {env = "PYARROW_CXXFLAGS", default = ""} [tool.setuptools_scm] root = '..' @@ -134,4 +136,4 @@ exclude = [ "benchmarks", "examples", "scripts", -] +] \ No newline at end of file diff --git a/python/setup.py b/python/setup.py deleted file mode 100755 index 23b468db7c38..000000000000 --- a/python/setup.py +++ /dev/null @@ -1,481 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import contextlib -import os -import os.path -from os.path import join as pjoin -import re -import shlex -import shutil -import sys -import warnings - -if sys.version_info >= (3, 10): - import sysconfig -else: - # Get correct EXT_SUFFIX on Windows (https://bugs.python.org/issue39825) - from distutils import sysconfig - -from setuptools import setup, Extension, Distribution -from setuptools.command.sdist import sdist - -from Cython.Distutils import build_ext as _build_ext -import Cython - -# Check if we're running 64-bit Python -is_64_bit = sys.maxsize > 2**32 - -# We can't use sys.platform in a cross-compiling situation -# as here it may be set to the host not target platform -is_emscripten = ( - sysconfig.get_config_var("SOABI") - and sysconfig.get_config_var("SOABI").find("emscripten") != -1 -) - - -if Cython.__version__ < '3.1': - raise Exception( - 'Please update your Cython version. Supported Cython >= 3.1') - -setup_dir = os.path.abspath(os.path.dirname(__file__)) - -ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') - - -@contextlib.contextmanager -def changed_dir(dirname): - oldcwd = os.getcwd() - os.chdir(dirname) - try: - yield - finally: - os.chdir(oldcwd) - - -def strtobool(val): - """Convert a string representation of truth to true (1) or false (0). - - True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values - are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if - 'val' is anything else. 
- """ - # Copied from distutils - val = val.lower() - if val in ('y', 'yes', 't', 'true', 'on', '1'): - return 1 - elif val in ('n', 'no', 'f', 'false', 'off', '0'): - return 0 - else: - raise ValueError("invalid truth value %r" % (val,)) - - -MSG_DEPR_SETUP_BUILD_FLAGS = """ - !! - - *********************************************************************** - The '{}' flag is being passed to setup.py, but this is - deprecated. - - If a certain component is available in Arrow C++, it will automatically - be enabled for the PyArrow build as well. If you want to force the - build of a certain component, you can still use the - PYARROW_WITH_$COMPONENT environment variable. - *********************************************************************** - - !! -""" - - -class build_ext(_build_ext): - _found_names = () - - def build_extensions(self): - import numpy - numpy_incl = numpy.get_include() - - self.extensions = [ext for ext in self.extensions - if ext.name != '__dummy__'] - - for ext in self.extensions: - if (hasattr(ext, 'include_dirs') and - numpy_incl not in ext.include_dirs): - ext.include_dirs.append(numpy_incl) - _build_ext.build_extensions(self) - - def run(self): - self._run_cmake() - self._update_stubs() - _build_ext.run(self) - - def _update_stubs(self): - """Copy stubs to build directory, then inject docstrings into the copies.""" - stubs_dir = pjoin(setup_dir, 'pyarrow-stubs') - if not os.path.exists(stubs_dir): - return - - build_cmd = self.get_finalized_command('build') - build_lib = os.path.abspath(build_cmd.build_lib) - - self._copy_stubs(stubs_dir, build_lib) - - if os.environ.get('PYARROW_SKIP_STUB_DOCSTRINGS', '0') == '1': - print("-- Skipping stub docstring injection " - "(PYARROW_SKIP_STUB_DOCSTRINGS=1)") - return - - # Inject docstrings from the built pyarrow into the stub copies. 
- try: - from pyarrow._build_utils.update_stub_docstrings import ( - add_docstrings_from_build, - ) - add_docstrings_from_build(build_lib, build_lib) - except ImportError as e: - print(f"-- Skipping stub docstring injection ({e})") - - def _copy_stubs(self, stubs_dir, build_lib): - """Copy .pyi stub files to the build directory.""" - src_dir = pjoin(stubs_dir, 'pyarrow') - dest_dir = pjoin(build_lib, 'pyarrow') - - if not os.path.exists(src_dir): - return - - print(f"-- Copying stubs: {src_dir} -> {dest_dir}") - for root, dirs, files in os.walk(src_dir): - for fname in files: - if fname.endswith('.pyi'): - src = pjoin(root, fname) - rel_path = os.path.relpath(src, src_dir) - dest = pjoin(dest_dir, rel_path) - os.makedirs(os.path.dirname(dest), exist_ok=True) - shutil.copy2(src, dest) - - # adapted from cmake_build_ext in dynd-python - # github.com/libdynd/dynd-python - - description = "Build the C-extensions for arrow" - user_options = ([('cmake-generator=', None, 'CMake generator'), - ('extra-cmake-args=', None, 'extra arguments for CMake'), - ('build-type=', None, - 'build type (debug or release), default release'), - ('boost-namespace=', None, - 'namespace of boost (default: boost)'), - ('with-cuda', None, 'build the Cuda extension'), - ('with-flight', None, 'build the Flight extension'), - ('with-substrait', None, 'build the Substrait extension'), - ('with-acero', None, 'build the Acero Engine extension'), - ('with-dataset', None, 'build the Dataset extension'), - ('with-parquet', None, 'build the Parquet extension'), - ('with-parquet-encryption', None, - 'build the Parquet encryption extension'), - ('with-azure', None, - 'build the Azure Blob Storage extension'), - ('with-gcs', None, - 'build the Google Cloud Storage (GCS) extension'), - ('with-s3', None, 'build the Amazon S3 extension'), - ('with-static-parquet', None, 'link parquet statically'), - ('with-static-boost', None, 'link boost statically'), - ('with-orc', None, 'build the ORC extension'), - 
('with-gandiva', None, 'build the Gandiva extension'), - ('generate-coverage', None, - 'enable Cython code coverage'), - ('bundle-boost', None, - 'bundle the (shared) Boost libraries'), - ('bundle-cython-cpp', None, - 'bundle generated Cython C++ code ' - '(used for code coverage)'), - ('bundle-arrow-cpp', None, - 'bundle the Arrow C++ libraries'), - ('bundle-arrow-cpp-headers', None, - 'bundle the Arrow C++ headers')] + - _build_ext.user_options) - - def initialize_options(self): - _build_ext.initialize_options(self) - self.cmake_generator = os.environ.get('PYARROW_CMAKE_GENERATOR') - if not self.cmake_generator and sys.platform == 'win32': - self.cmake_generator = 'Visual Studio 15 2017 Win64' - self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') - self.build_type = os.environ.get('PYARROW_BUILD_TYPE', - 'release').lower() - - self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '') - - if sys.platform == 'win32': - # Cannot do debug builds in Windows unless Python itself is a debug - # build - if not hasattr(sys, 'gettotalrefcount'): - self.build_type = 'release' - - self.with_azure = None - self.with_gcs = None - self.with_s3 = None - self.with_hdfs = None - self.with_cuda = None - self.with_substrait = None - self.with_flight = None - self.with_acero = None - self.with_dataset = None - self.with_parquet = None - self.with_parquet_encryption = None - self.with_orc = None - self.with_gandiva = None - - self.generate_coverage = strtobool( - os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) - self.bundle_arrow_cpp = strtobool( - os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) - self.bundle_cython_cpp = strtobool( - os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) - - CYTHON_MODULE_NAMES = [ - 'lib', - '_fs', - '_csv', - '_json', - '_compute', - '_cuda', - '_flight', - '_dataset', - '_dataset_orc', - '_dataset_parquet', - '_acero', - '_feather', - '_parquet', - '_parquet_encryption', - '_pyarrow_cpp_tests', - '_orc', - '_azurefs', - '_gcsfs', 
- '_s3fs', - '_substrait', - '_hdfs', - 'gandiva'] - - def _run_cmake(self): - # check if build_type is correctly passed / set - if self.build_type.lower() not in ('release', 'debug', - 'relwithdebinfo'): - raise ValueError("--build-type (or PYARROW_BUILD_TYPE) needs to " - "be 'release', 'debug' or 'relwithdebinfo'") - - # The directory containing this setup.py - source = os.path.dirname(os.path.abspath(__file__)) - - # The staging directory for the module being built - build_cmd = self.get_finalized_command('build') - saved_cwd = os.getcwd() - build_temp = pjoin(saved_cwd, build_cmd.build_temp) - build_lib = pjoin(saved_cwd, build_cmd.build_lib) - - if not os.path.isdir(build_temp): - self.mkpath(build_temp) - - if self.inplace: - # a bit hacky - build_lib = saved_cwd - - install_prefix = pjoin(build_lib, "pyarrow") - - # Change to the build directory - with changed_dir(build_temp): - # Detect if we built elsewhere - if os.path.isfile('CMakeCache.txt'): - cachefile = open('CMakeCache.txt', 'r') - cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', - cachefile.read()).group(1) - cachefile.close() - if (cachedir != build_temp): - build_base = pjoin(saved_cwd, build_cmd.build_base) - print(f"-- Skipping build. 
Temp build {build_temp} does " - f"not match cached dir {cachedir}") - print("---- For a clean build you might want to delete " - f"{build_base}.") - return - - cmake_options = [ - f'-DCMAKE_INSTALL_PREFIX={install_prefix}', - f'-DPYTHON_EXECUTABLE={sys.executable}', - f'-DPython3_EXECUTABLE={sys.executable}', - f'-DPYARROW_CXXFLAGS={self.cmake_cxxflags}', - ] - - def append_cmake_bool(value, varname): - cmake_options.append(f'-D{varname}={"on" if value else "off"}') - - def append_cmake_component(flag, varname): - # only pass this to cmake if the user pass the --with-component - # flag to setup.py build_ext - if flag is not None: - flag_name = ( - "--with-" - + varname.removeprefix("PYARROW_").lower().replace("_", "-")) - warnings.warn( - MSG_DEPR_SETUP_BUILD_FLAGS.format(flag_name), - UserWarning, stacklevel=2 - ) - append_cmake_bool(flag, varname) - - if self.cmake_generator: - cmake_options += ['-G', self.cmake_generator] - - append_cmake_component(self.with_cuda, 'PYARROW_CUDA') - append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT') - append_cmake_component(self.with_flight, 'PYARROW_FLIGHT') - append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA') - append_cmake_component(self.with_acero, 'PYARROW_ACERO') - append_cmake_component(self.with_dataset, 'PYARROW_DATASET') - append_cmake_component(self.with_orc, 'PYARROW_ORC') - append_cmake_component(self.with_parquet, 'PYARROW_PARQUET') - append_cmake_component(self.with_parquet_encryption, - 'PYARROW_PARQUET_ENCRYPTION') - append_cmake_component(self.with_azure, 'PYARROW_AZURE') - append_cmake_component(self.with_gcs, 'PYARROW_GCS') - append_cmake_component(self.with_s3, 'PYARROW_S3') - append_cmake_component(self.with_hdfs, 'PYARROW_HDFS') - - append_cmake_bool(self.bundle_arrow_cpp, - 'PYARROW_BUNDLE_ARROW_CPP') - append_cmake_bool(self.bundle_cython_cpp, - 'PYARROW_BUNDLE_CYTHON_CPP') - append_cmake_bool(self.generate_coverage, - 'PYARROW_GENERATE_COVERAGE') - - cmake_options.append( - 
f'-DCMAKE_BUILD_TYPE={self.build_type.lower()}') - - extra_cmake_args = shlex.split(self.extra_cmake_args) - - build_tool_args = [] - if sys.platform == 'win32': - if not is_64_bit: - raise RuntimeError('Not supported on 32-bit Windows') - else: - build_tool_args.append('--') - if os.environ.get('PYARROW_BUILD_VERBOSE', '0') == '1': - cmake_options.append('-DCMAKE_VERBOSE_MAKEFILE=ON') - parallel = os.environ.get('PYARROW_PARALLEL') - if parallel: - build_tool_args.append(f'-j{parallel}') - - # Generate the build files - if is_emscripten: - print("-- Running emcmake cmake for PyArrow on Emscripten") - self.spawn(['emcmake', 'cmake'] + extra_cmake_args + - cmake_options + [source]) - else: - print("-- Running cmake for PyArrow") - self.spawn(['cmake'] + extra_cmake_args + cmake_options + [source]) - - print("-- Finished cmake for PyArrow") - - print("-- Running cmake --build for PyArrow") - self.spawn(['cmake', '--build', '.', '--config', self.build_type] + - build_tool_args) - print("-- Finished cmake --build for PyArrow") - - print("-- Running cmake --build --target install for PyArrow") - self.spawn(['cmake', '--build', '.', '--config', self.build_type] + - ['--target', 'install'] + build_tool_args) - print("-- Finished cmake --build --target install for PyArrow") - - self._found_names = [] - for name in self.CYTHON_MODULE_NAMES: - built_path = pjoin(install_prefix, name + ext_suffix) - if os.path.exists(built_path): - self._found_names.append(name) - - def _get_build_dir(self): - # Get the package directory from build_py - build_py = self.get_finalized_command('build_py') - return build_py.get_package_dir('pyarrow') - - def _get_cmake_ext_path(self, name): - # This is the name of the arrow C-extension - filename = name + ext_suffix - return pjoin(self._get_build_dir(), filename) - - def get_ext_generated_cpp_source(self, name): - if sys.platform == 'win32': - head, tail = os.path.split(name) - return pjoin(head, tail + ".cpp") - else: - return pjoin(name + 
".cpp") - - def get_ext_built_api_header(self, name): - if sys.platform == 'win32': - head, tail = os.path.split(name) - return pjoin(head, tail + "_api.h") - else: - return pjoin(name + "_api.h") - - def get_names(self): - return self._found_names - - def get_outputs(self): - # Just the C extensions - # regular_exts = _build_ext.get_outputs(self) - return [self._get_cmake_ext_path(name) - for name in self.get_names()] - - -class BinaryDistribution(Distribution): - def has_ext_modules(foo): - return True - - -class CopyLicenseSdist(sdist): - """Custom sdist command that copies license files from parent directory.""" - - def make_release_tree(self, base_dir, files): - # Call parent to do the normal work - super().make_release_tree(base_dir, files) - - # Define source (parent dir) and destination (sdist root) for license files - license_files = [ - ("LICENSE.txt", "../LICENSE.txt"), - ("NOTICE.txt", "../NOTICE.txt"), - ] - - for dest_name, src_path in license_files: - src_full = os.path.join(os.path.dirname(__file__), src_path) - dest_full = os.path.join(base_dir, dest_name) - - # Remove any existing file/symlink at destination - if os.path.exists(dest_full) or os.path.islink(dest_full): - os.unlink(dest_full) - - if not os.path.exists(src_full): - msg = f"Required license file not found: {src_full}" - raise FileNotFoundError(msg) - - shutil.copy2(src_full, dest_full) - print(f"Copied {src_path} to {dest_name} in sdist") - - -setup( - distclass=BinaryDistribution, - # Dummy extension to trigger build_ext - ext_modules=[Extension('__dummy__', sources=[])], - cmdclass={ - 'build_ext': build_ext, - 'sdist': CopyLicenseSdist, - }, -) From 15df0e4d3ce16d84dceef8c0f3d6937ea8fd6f01 Mon Sep 17 00:00:00 2001 From: vanshaj2023 Date: Thu, 12 Mar 2026 02:27:59 +0530 Subject: [PATCH 3/4] GH-49273: [Python] Add stub docstring injection to scikit-build-core backend - Add build_wheel wrapper to _build_backend/__init__.py that copies .pyi stubs into the wheel and injects docstrings 
from the built pyarrow runtime - Make libcst optional (graceful degradation when not installed) - Add wheel.exclude to prevent pyarrow/_build_utils from being packaged - Support PYARROW_SKIP_STUB_DOCSTRINGS=1 env var to skip injection --- python/_build_backend/__init__.py | 85 +++++++++++++++++++++++++++++++ python/pyproject.toml | 5 +- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/python/_build_backend/__init__.py b/python/_build_backend/__init__.py index 6e5c328a69ff..099fd1e99020 100644 --- a/python/_build_backend/__init__.py +++ b/python/_build_backend/__init__.py @@ -31,17 +31,23 @@ The symlinks are restored afterwards to keep the git working tree clean. """ +import base64 from contextlib import contextmanager +import hashlib import os from pathlib import Path import shutil import sys +import tempfile +import zipfile from scikit_build_core.build import * # noqa: F401,F403 from scikit_build_core.build import build_sdist as scikit_build_sdist +from scikit_build_core.build import build_wheel as scikit_build_wheel LICENSE_FILES = ("LICENSE.txt", "NOTICE.txt") PYTHON_DIR = Path(__file__).resolve().parent.parent +PYTHON_STUBS_DIR = PYTHON_DIR / "pyarrow-stubs" / "pyarrow" @contextmanager @@ -66,3 +72,82 @@ def prepare_licenses(): def build_sdist(sdist_directory, config_settings=None): with prepare_licenses(): return scikit_build_sdist(sdist_directory, config_settings) + + +def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): + wheel_name = scikit_build_wheel( + wheel_directory, config_settings, metadata_directory + ) + wheel_path = Path(wheel_directory) / wheel_name + _inject_stub_docstrings(wheel_path) + return wheel_name + + +def _inject_stub_docstrings(wheel_path): + """Extract wheel, copy stubs, inject docstrings from runtime, repack.""" + if not PYTHON_STUBS_DIR.exists(): + return + + if os.environ.get("PYARROW_SKIP_STUB_DOCSTRINGS", "0") == "1": + print("-- Skipping stub docstring injection 
(PYARROW_SKIP_STUB_DOCSTRINGS=1)") + return + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + + # Extract wheel into temp dir + with zipfile.ZipFile(wheel_path, "r") as whl: + whl.extractall(tmp_path) + + # Copy .pyi stubs alongside the built pyarrow package + pyarrow_dir = tmp_path / "pyarrow" + for stub_file in PYTHON_STUBS_DIR.rglob("*.pyi"): + rel = stub_file.relative_to(PYTHON_STUBS_DIR) + dest = pyarrow_dir / rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(stub_file, dest) + + # Inject docstrings extracted from the built pyarrow runtime + try: + from pyarrow._build_utils.update_stub_docstrings import ( + add_docstrings_from_build, + ) + add_docstrings_from_build(pyarrow_dir, tmp_path) + except ImportError as e: + print(f"-- Skipping stub docstring injection ({e})") + + # Repack the modified contents back into the wheel file + _repack_wheel(wheel_path, tmp_path) + + +def _repack_wheel(wheel_path, extracted_dir): + """Repack a wheel from an extracted directory, regenerating RECORD checksums.""" + dist_info_dirs = list(extracted_dir.glob("*.dist-info")) + if not dist_info_dirs: + raise RuntimeError("No .dist-info directory found in extracted wheel") + record_path = dist_info_dirs[0] / "RECORD" + record_rel = record_path.relative_to(extracted_dir) + + # Compute hashes for all files except RECORD itself + all_files = sorted( + f for f in extracted_dir.rglob("*") + if f.is_file() and f != record_path + ) + record_lines = [] + for f in all_files: + rel = f.relative_to(extracted_dir) + data = f.read_bytes() + digest = base64.urlsafe_b64encode( + hashlib.sha256(data).digest() + ).rstrip(b"=").decode() + record_lines.append(f"{rel},sha256={digest},{len(data)}") + record_lines.append(f"{record_rel},,") + record_path.write_text("\n".join(record_lines) + "\n") + + # Overwrite the original wheel file + tmp = wheel_path.with_suffix(".tmp") + with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as whl: + for f in 
sorted(extracted_dir.rglob("*")): + if f.is_file(): + whl.write(f, f.relative_to(extracted_dir)) + tmp.replace(wheel_path) diff --git a/python/pyproject.toml b/python/pyproject.toml index 5843b5937795..809b169a4af8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -19,8 +19,8 @@ requires = [ "scikit-build-core", "cython >= 3.1", - # Needed for build-time stub docstring extraction - "libcst>=1.8.6", + # Optional: enables stub docstring injection during build + # "libcst>=1.8.6", "numpy>=1.25", "setuptools_scm[toml]>=8", ] @@ -88,6 +88,7 @@ metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"] wheel.packages = ["pyarrow"] wheel.install-dir = "pyarrow" +wheel.exclude = ["pyarrow/_build_utils/**"] [tool.scikit-build.cmake.define] PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP", default = "OFF"} From 553e68891813aa720313059b69354a413b297d7c Mon Sep 17 00:00:00 2001 From: vanshaj2023 Date: Thu, 12 Mar 2026 02:37:54 +0530 Subject: [PATCH 4/4] GH-49273: Fix wheel RECORD paths to use forward slashes --- python/_build_backend/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/_build_backend/__init__.py b/python/_build_backend/__init__.py index 099fd1e99020..be5f2775d914 100644 --- a/python/_build_backend/__init__.py +++ b/python/_build_backend/__init__.py @@ -135,13 +135,13 @@ def _repack_wheel(wheel_path, extracted_dir): ) record_lines = [] for f in all_files: - rel = f.relative_to(extracted_dir) + rel = f.relative_to(extracted_dir).as_posix() data = f.read_bytes() digest = base64.urlsafe_b64encode( hashlib.sha256(data).digest() ).rstrip(b"=").decode() record_lines.append(f"{rel},sha256={digest},{len(data)}") - record_lines.append(f"{record_rel},,") + record_lines.append(f"{record_rel.as_posix()},,") record_path.write_text("\n".join(record_lines) + "\n") # Overwrite the original wheel file