From 7e0775283cd250829be12b7fc8c0e768f51b4313 Mon Sep 17 00:00:00 2001 From: James Parrott <80779630+JamesParrott@users.noreply.github.com> Date: Wed, 27 May 2026 12:23:05 +0100 Subject: [PATCH 1/2] Restore edit and merge test shapefiles for run_benchmarks.py Add ShpWriter and ShxWriter. Reduce clutter from tests, by using temp dirs for writing shapefiles. Create ShpWriter as well as ShxWriter Bug fixes Delete merge & edit. Move test shapefiles out of test into parent dir (shapefiles) Update name of _replace_remote_url_with_localhost Restore _replace_remote_url_with_localhost as it's used by test_shapefiles.py Add clear_globs=True to doctest runner .run Quote replaced localhost URLs in doctests Close all Reader handles to shapefiles/tests, and reorder jobs to fail faster Don't close Reader before it's read from --- .../workflows/run_checks_build_and_test.yml | 12 +- README.md | 22 +- run_benchmarks.py | 4 +- shapefiles/{test => }/ATTRIBUTION | 0 shapefiles/{test => }/REL.zip | Bin shapefiles/corrupt_too_long.dbf | Bin 0 -> 580 bytes shapefiles/corrupt_too_long.shp | Bin 0 -> 1145 bytes shapefiles/corrupt_too_long.shx | Bin 0 -> 180 bytes shapefiles/{test => }/edit.dbf | Bin 11368 -> 11368 bytes shapefiles/{test => }/edit.shp | Bin shapefiles/{test => }/edit.shx | Bin shapefiles/{test => }/latin1.dbf | Bin shapefiles/{test => }/latin1.shp | Bin shapefiles/{test => }/merge.dbf | Bin 2355059 -> 2355059 bytes shapefiles/{test => }/merge.shp | Bin shapefiles/{test => }/merge.shx | Bin src/shapefile.py | 713 ++++++++++-------- test_shapefile.py | 10 +- 18 files changed, 433 insertions(+), 328 deletions(-) rename shapefiles/{test => }/ATTRIBUTION (100%) rename shapefiles/{test => }/REL.zip (100%) create mode 100644 shapefiles/corrupt_too_long.dbf create mode 100644 shapefiles/corrupt_too_long.shp create mode 100644 shapefiles/corrupt_too_long.shx rename shapefiles/{test => }/edit.dbf (99%) rename shapefiles/{test => }/edit.shp (100%) rename shapefiles/{test => }/edit.shx (100%) rename shapefiles/{test => }/latin1.dbf (100%) rename shapefiles/{test => }/latin1.shp (100%) rename shapefiles/{test => }/merge.dbf (99%) rename shapefiles/{test => }/merge.shp (100%) rename shapefiles/{test => }/merge.shx (100%) diff --git a/.github/workflows/run_checks_build_and_test.yml b/.github/workflows/run_checks_build_and_test.yml index eb8195b4..2366741c 100644 --- a/.github/workflows/run_checks_build_and_test.yml +++ b/.github/workflows/run_checks_build_and_test.yml @@ -57,18 +57,18 @@ jobs: fail-fast: false matrix: python-version: [ - "3.9", - "3.10", - "3.11", - "3.12", - "3.13", "3.14", + "3.13", + "3.12", "3.15-dev", + "3.11", + "3.10", + "3.9", ] os: [ - "macos-latest", "ubuntu-24.04", "windows-latest", + "macos-latest", ] diff --git a/README.md b/README.md index fd2d3e9c..91e205e4 100644 --- a/README.md +++ b/README.md @@ -940,6 +940,7 @@ length of text values to save space: >>> r = shapefile.Reader('shapefiles/test/dtype') >>> assert r.record(0) == ['Hello', 'World', 'World'*50] + >>> r.close() Date fields are created using the 'D' type, and can be created using either date objects, lists, or a YYYYMMDD formatted string. @@ -964,6 +965,7 @@ Field length or decimal have no impact on this type: >>> assert r.record(1) == [date(1998,1,30)] >>> assert r.record(2) == [date(1998,1,30)] >>> assert r.record(3) == [None] + >>> r.close() Numeric fields are created using the 'N' type (or the 'F' type, which is exactly the same). By default the fourth decimal argument is set to zero, essentially creating an integer field. @@ -989,6 +991,7 @@ To store very large numbers you must increase the field length size to the total >>> r = shapefile.Reader('shapefiles/test/dtype') >>> assert r.record(0) == [1, 1.32, 1.3217328, -3.2302e-25, 1.3217328, 10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000] >>> assert r.record(1) == [None, None, None, None, None, None] + >>> r.close() Finally, we can create boolean fields by setting the type to 'L'. @@ -1025,6 +1028,7 @@ None is interpreted as missing. Record #4: [None] >>> r.record(5) Record #5: [None] + >>> r.close() You can also add attributes using keyword arguments where the keys are field names. @@ -1150,6 +1154,7 @@ This can be particularly useful for copying from one file to another: ... w.record(*shaperec.record) ... w.shape(shaperec.shape.__geo_interface__) + >>> r.close() >>> w.close() @@ -1256,7 +1261,7 @@ For reading shapefiles in any non-utf8 encoding, such as Latin-1, just supply the encoding option when creating the Reader class. - >>> r = shapefile.Reader("shapefiles/test/latin1.shp", encoding="latin1") + >>> r = shapefile.Reader("shapefiles/latin1.shp", encoding="latin1") >>> r.record(0) == [2, u'Ñandú'] True @@ -1269,11 +1274,13 @@ should give you the same unicode string you started with. >>> w.fields = r.fields[1:] >>> w.record(*r.record(0)) >>> w.null() + >>> r.close() >>> w.close() >>> r = shapefile.Reader("shapefiles/test/latin_as_utf8.shp", encoding="utf8") >>> r.record(0) == [2, u'Ñandú'] True + >>> r.close() If you supply the wrong encoding and the string is unable to be decoded, PyShp will by default raise an exception. If however, on rare occasion, you are unable to find the correct encoding and want to ignore @@ -1281,9 +1288,10 @@ or replace encoding errors, you can specify the "encodingErrors" to be used by t applies to both reading and writing. - >>> r = shapefile.Reader("shapefiles/test/latin1.shp", encoding="ascii", encodingErrors="replace") + >>> r = shapefile.Reader("shapefiles/latin1.shp", encoding="ascii", encodingErrors="replace") >>> r.record(0) == [2, u'�and�'] True + >>> r.close() @@ -1491,6 +1499,8 @@ Shapefiles containing M-values can be examined in several ways: >>> r.shape(0).m # flat list of M-values [0.0, None, 3.0, None, 0.0, None, None] + >>> r.close() + ### Shapefiles with elevation (Z) values @@ -1524,6 +1534,8 @@ To examine a Z-type shapefile you can do: >>> r.shape(0).z # flat list of Z-values [18.0, 20.0, 22.0, 0.0, 0.0, 0.0, 0.0, 15.0, 13.0, 14.0] + >>> r.close() + ### 3D MultiPatch Shapefiles Multipatch shapes are useful for storing composite 3-Dimensional objects. @@ -1583,8 +1595,6 @@ installed) from the test file: python test_shapefile.py ``` -Linux/Mac and similar platforms may need to run `$ dos2unix README.md` in order -to correct line endings in README.md, if Git has not automatically changed them. ## Network tests @@ -1598,7 +1608,7 @@ pytest -m "not network" ``` or the doctests via: ```shell -python shapefile.py -m "not network" +python test_shapefile.py -m "not network" ``` or ii) by cloning a repo of the files they download, serving these on localhost in a separate process, and running the network tests with the environment variable REPLACE_REMOTE_URLS_WITH_LOCALHOST to `yes`: @@ -1614,7 +1624,7 @@ REPLACE_REMOTE_URLS_WITH_LOCALHOST=yes && pytest ``` or the doctests via: ```bash -REPLACE_REMOTE_URLS_WITH_LOCALHOST=yes && python shapefile.py +REPLACE_REMOTE_URLS_WITH_LOCALHOST=yes && python test_shapefile.py ``` The network tests alone can also be run (without also running all the tests that don't make network requests) using: `pytest -m network` (or the doctests using: `python shapefile.py -m network`). diff --git a/run_benchmarks.py b/run_benchmarks.py index 28017913..31e2265e 100644 --- a/run_benchmarks.py +++ b/run_benchmarks.py @@ -23,8 +23,8 @@ blockgroups_file = REPO_ROOT / "shapefiles" / "blockgroups.shp" -edit_file = REPO_ROOT / "shapefiles" / "test" / "edit.shp" -merge_file = REPO_ROOT / "shapefiles" / "test" / "merge.shp" +edit_file = REPO_ROOT / "shapefiles" / "edit.shp" +merge_file = REPO_ROOT / "shapefiles" / "merge.shp" states_provinces_file = PYSHP_TEST_REPO / "ne_10m_admin_1_states_provinces.shp" tiny_countries_file = PYSHP_TEST_REPO / "ne_110m_admin_0_tiny_countries.shp" gis_osm_natural_file = PYSHP_TEST_REPO / "gis_osm_natural_a_free_1.zip" diff --git a/shapefiles/test/ATTRIBUTION b/shapefiles/ATTRIBUTION similarity index 100% rename from shapefiles/test/ATTRIBUTION rename to shapefiles/ATTRIBUTION diff --git a/shapefiles/test/REL.zip b/shapefiles/REL.zip similarity index 100% rename from shapefiles/test/REL.zip rename to shapefiles/REL.zip diff --git a/shapefiles/corrupt_too_long.dbf b/shapefiles/corrupt_too_long.dbf new file mode 100644 index 0000000000000000000000000000000000000000..3eb084b97c99eb90ab1fb3b8fa340329fea444ca GIT binary patch literal 580 zcmZRsW0mG&U|?`$Fb0yCpd_`p1S;wbqK(j$@G6ug=9H!?kOW4x43`iyG%_|Z1puaH BQv(11 literal 0 HcmV?d00001 diff --git a/shapefiles/corrupt_too_long.shp b/shapefiles/corrupt_too_long.shp new file mode 100644 index 0000000000000000000000000000000000000000..e9ef9cf1074e45992a145aba00b62bb1e3e78a85 GIT binary patch literal 1145 zcmZQzQ0HR63YctOFf%X!5fpr|hcI9?g9B6oMnkz+DMp~E0WoHQ6e84t%!1j6PGi{b x094Eb#PqNSRP Q3rf#}(w87K6B~#I07&!-R{#J2 literal 0 HcmV?d00001 diff --git a/shapefiles/test/edit.dbf b/shapefiles/edit.dbf similarity index 99% rename from shapefiles/test/edit.dbf rename to shapefiles/edit.dbf index 8854e03229f03fc52e5bc7b8e55dbb4f052f2176..c9d39bab2a496349935f15f8303c1b9c55e17636 100644 GIT binary patch delta 13 UcmaD6@gjnSxsFwOBTI@703~z;YXATM delta 13 UcmaD6@gjnSxr&W(BTI@703_T5P5=M^ diff --git a/shapefiles/test/edit.shp b/shapefiles/edit.shp similarity index 100% rename from shapefiles/test/edit.shp rename to shapefiles/edit.shp diff --git a/shapefiles/test/edit.shx b/shapefiles/edit.shx similarity index 100% rename from shapefiles/test/edit.shx rename to shapefiles/edit.shx diff --git a/shapefiles/test/latin1.dbf b/shapefiles/latin1.dbf similarity index 100% rename from shapefiles/test/latin1.dbf rename to shapefiles/latin1.dbf diff --git a/shapefiles/test/latin1.shp b/shapefiles/latin1.shp similarity index 100% rename from shapefiles/test/latin1.shp rename to shapefiles/latin1.shp diff --git a/shapefiles/test/merge.dbf b/shapefiles/merge.dbf similarity index 99% rename from shapefiles/test/merge.dbf rename to shapefiles/merge.dbf index 4164f11460ac38bea5590b26d7d4cd9539287306..4c64e86a63dc0e0d508deda31c269068acd4fe0a 100644 GIT binary patch delta 122 zcmWN@$qj-4006-t3nF-e;DHBVfplQPW8YRoJGS712IMixOy5s1%?RI7Js^QBB$WA) u#Ilq`R+364YuU(FcCwd)9`ck%TIoh!{qGMhnH;PD delta 122 zcmWN@xeV;`2B=&Li9?)8woO#NT$b< t$XrrcNG7=~rI3}ZrIboGvX!0e bool: if not f: return False + # If it doesn't declare itself closed, assume it's open. return not getattr(f, "closed", False) @@ -3772,11 +3774,24 @@ def __init__(self, file: str | PathLike[Any] | WriteSeekableBinStream): def file(self) -> WriteSeekableBinStream: return self._ensure_file_obj() + def is_open(self) -> bool: + return bool(self.file and _is_file_obj_open(self.file)) + + def _header(self) -> None: + raise NotImplementedError + + def close(self) -> None: + self._header() + if self.is_open(): + _try_to_flush_file_obj(self.file) + super().close() + class DbfWriter(_HasCheckedWriteableFile): """Writes .dbf files (dBASE database files), in particular those of Shapefiles.""" ext = ".dbf" + ExceptionClass = dbfFileException def __init__( self, @@ -3796,20 +3811,6 @@ def __init__( self.fields: list[Field] = [] self.max_num_fields = max_num_fields self.recNum = 0 - self.deletionFlag = 0 - - def close(self) -> None: - """ - Write final dbf header, close opened files. - """ - - # Update the dbf header with final length etc - if _is_file_obj_open(self.file): - self._dbfHeader() - - _try_to_flush_file_obj(self.file) - - super().close() def field( # Types of args should match *Field @@ -3827,7 +3828,7 @@ def field( field_ = Field.from_unchecked(name, field_type, size, decimal) self.fields.append(field_) - def _dbfHeader(self) -> None: + def _header(self) -> None: """Writes the dbf header and field descriptors.""" f = self.file f.seek(0) @@ -3920,7 +3921,7 @@ def __dbfRecord(self, record: list[RecordValue]) -> None: # first records, so all fields should be set # allowing us to write the dbf header # cannot change the fields after this point - self._dbfHeader() + self._header() # first byte of the record is deletion flag, always disabled f.write(b" ") # begin @@ -4007,205 +4008,115 @@ def __dbfRecord(self, record: list[RecordValue]) -> None: f.write(encoded) -class Writer(_FileObjChecker[WriteSeekableBinStream]): - """Provides write support for ESRI Shapefiles.""" - - # W = TypeVar("W", bound=WriteSeekableBinStream) +class _ShpShxHeaderWriter(_HasCheckedWriteableFile): + def __init__(self, file: str | PathLike[Any] | WriteSeekableBinStream): + super().__init__(file=file) + # Initiate with empty headers, to be finalized upon closing + self.file.seek(0) + self.file.write(b"9" * 100) - FileProto = WriteSeekableBinStream - new_file_obj_mode = "w+b" - ext = None - ExceptionClass = ShapefileException + def _write_file_length(self) -> None: + raise NotImplementedError - def __init__( - self, - target: str | PathLike[Any] | None = None, - shapeType: int | None = None, - autoBalance: bool = False, - *, - encoding: str = "utf-8", - encodingErrors: str = "strict", - shp: WriteSeekableBinStream | None = None, - shx: WriteSeekableBinStream | None = None, - dbf: WriteSeekableBinStream | None = None, - # Keep kwargs even though unused, to preserve PyShp 2.4 API - **kwargs: Any, - ): - # Don't call super().__init - if target is not None: - try: - target = Path(target) - except (ValueError, TypeError): - raise TypeError( - f"The target filepath {target!r} must be a str, Path, or os.PathLike, not {type(target)}." - ) - self.target: Path | None = target + def _shp_or_shx_header(self, shp_info: _ShpWriterInfo) -> None: + """Writes the specified header type to the specified file-like object. + Several of the shapefile formats are so similar that a single generic + method to read or write them is warranted.""" + f = self.file + f.seek(0) + # File code, Unused bytes + f.write(pack(">6i", 9994, 0, 0, 0, 0, 0)) - # User settable - see ### Geometry and Record Balancing in README.md - self.autoBalance = autoBalance + # File length (Bytes / 2 = 16-bit words). + # Hook for shx and shp specific subclasses to implement. + self._write_file_length() - # User settable - see #### Setting the Shape Type in README.md - self.shapeType = shapeType + f.write(pack("<2i", 1000, shp_info.shapeType or NULL)) - self._shp: Path | WriteSeekableBinStream | None = shp - self._shx: Path | WriteSeekableBinStream | None = shx - self._dbf: Path | WriteSeekableBinStream | None = dbf - self._dbf_writer: DbfWriter | None = None - self.exit_stack = ExitStack() - if target is not None: - if shp or shx or dbf: - raise TypeError( - "Unused kwargs were silently ignored by previous versions of PyShp. " - "Either specify target (first positional only arg), " - "or shp and/or dbf, possible plus shx" - ) - self._shp = target.with_suffix(".shp") - self._shx = target.with_suffix(".shx") - self._dbf = target.with_suffix(".dbf") - elif not (shp or shx or dbf): - raise TypeError( - "Either the target filepath, or any of shp, shx, or dbf must be set to create a shapefile." - ) - if self._dbf: - self._dbf_writer = DbfWriter( - dbf=self._dbf, - encoding=encoding, - encodingErrors=encodingErrors, + # BBox, the shapefile's bounding box (lower left, upper right) + # In such cases of empty shapefiles, ESRI spec says the bbox values are 'unspecified'. + # Not sure what that means, so for now just setting to 0s, which is the same behavior as in previous versions. + # This would also make sense since the Z and M bounds are similarly set to 0 for non-Z/M type shapefiles. + # bbox: BBox = (0, 0, 0, 0) + bbox = shp_info._bbox or (0, 0, 0, 0) + try: + f.write(pack("<4d", *bbox)) + except StructError: + raise ShapefileException( + "Failed to write shapefile bounding box. Floats required." ) - # Initiate with empty headers, to be finalized upon closing - if self._shp: - self.shp.write(b"9" * 100) - if self._shx: - self.shx.write(b"9" * 100) - # Geometry record offsets and lengths for writing shx file. - self.shpNum = 0 - self._bbox: BBox | None = None - self._zbox: ZBox | None = None - self._mbox: MBox | None = None - # Use deletion flags in dbf? Default is false (0). Note: Currently has no effect, records should NOT contain deletion flags. - self.deletionFlag = 0 - @functools.cached_property - def shp(self) -> WriteSeekableBinStream: - return self._ensure_file_obj( - f=self._shp, - ) + # Elevation + # As per the ESRI shapefile spec, the zbox for non-Z type shapefiles are set to 0s + # zbox : ZBox = (0, 0). We also do this for empty shapefiles and null-shapes only files. + zbox = shp_info._zbox or (0, 0) - @functools.cached_property - def shx(self) -> WriteSeekableBinStream: - return self._ensure_file_obj( - f=self._shx, - ) + # Ms + # As per the ESRI shapefile spec, the mbox for non-M type shapefiles are set to 0s + # mbox: Mbox = (0, 0). We also do this for empty shapefiles and null-shapes only files. + mbox = shp_info._mbox or (0, 0) - @functools.cached_property - def dbf_writer(self) -> DbfWriter: - if self._dbf_writer is None: - raise dbfFileException( - f"No dbf file. Got target: {self.target} & dbf: {self._dbf}" + # Try writing + try: + f.write(pack("<4d", *zbox, *replace_None_with_NODATA(mbox))) + except StructError: + raise ShapefileException( + "Failed to write shapefile elevation and measure values. Floats required." ) - self.exit_stack.enter_context(self._dbf_writer) - return self._dbf_writer - - @property - def dbf(self) -> WriteSeekableBinStream: - return self.dbf_writer.file - - @property - def fields(self) -> list[Field]: - return self.dbf_writer.fields - - @fields.setter - def fields(self, value: list[Field]) -> None: - self.dbf_writer.fields = value - - @property - def recNum(self) -> int: - if not self._dbf_writer: - return 0 - return self.dbf_writer.recNum - - def __len__(self) -> int: - """Returns the current number of features written to the shapefile. - If shapes and records are unbalanced, the length is considered the highest - of the two.""" - if not self._dbf_writer: - return self.shpNum - return max(self.dbf_writer.recNum, self.shpNum) - def __enter__(self) -> Writer: - return self - def __exit__( +class _ShpWriterInfo(_ShpShxHeaderWriter): + def __init__( self, - exc_type: type[BaseException] | None, - exc_val: BaseException | None, - exc_tb: TracebackType | None, - ) -> None: - self.close() + shp: str | PathLike[Any] | WriteSeekableBinStream, + shapeType: int | None = None, + ): + super().__init__(file=shp) + self.shapeType: int | None = shapeType + # Geometry record offsets and lengths for writing shx file. + self.shpNum: int = 0 + self._bbox: BBox | None = None + self._zbox: ZBox | None = None + self._mbox: MBox | None = None - def __del__(self) -> None: - self.close() + def bbox(self) -> BBox | None: + """Returns the current bounding box for the shapefile which is + the lower-left and upper-right corners. It does not contain the + elevation or measure extremes.""" + return self._bbox - def close(self) -> None: - """ - Write final shp, shx, and dbf headers, close opened files. - """ + def zbox(self) -> ZBox | None: + """Returns the current z extremes for the shapefile.""" + return self._zbox - # Check if user supplied shp or dbf file objects have - # already been closed by the user for some reason. - # - # TODO: Do we really need to support this? A user who supplies - # custom file objects for shp and dbf, opens a Writer - # to partially write the Shapefile, manually closes the .shp object - # but uses the context manager or calls .close expecting - # Writer to create the dbf's header? Really? - shp_open = self._shp and _is_file_obj_open(self.shp) - dbf_open = ( - False - if self._dbf_writer is None - else _is_file_obj_open(self.dbf_writer.file) - ) + def mbox(self) -> MBox | None: + """Returns the current m extremes for the shapefile.""" + return self._mbox - # Balance if already not balanced - if shp_open and dbf_open: - if self.autoBalance: - self.balance() - if self.dbf_writer.recNum != self.shpNum: - raise ShapefileException( - "When saving both the dbf and shp file, " - f"the number of records ({self.dbf_writer.recNum}) must correspond " - f"with the number of shapes ({self.shpNum})" - ) - # Fill in the blank headers and flush files - if shp_open: - self._shp_or_shx_header(self.shp, headerType="shp") - _try_to_flush_file_obj(self.shp) +class ShpWriter(_ShpWriterInfo): + ext = ".shp" - if self._shx and _is_file_obj_open(self.shx): - self._shp_or_shx_header(self.shx, headerType="shx") - _try_to_flush_file_obj(self.shx) + def _header(self) -> None: + super()._shp_or_shx_header(shp_info=self) - # Ensure any files that the writer opened are closed. - # (contains self.dbf_writer, which is triggered to - # writes its header here, but should not contain - # user-supplied, already opened file objects that - # might be closed by an outer context manager). - # Idempotent. - self.exit_stack.close() + def _write_file_length(self) -> None: + # self.file required to be at correct position, e.g. + # if called by self._header + self.file.write(pack(">i", self._shp_file_length_B())) def _shp_file_length_B(self) -> int: """Calculates the file length of the shp file.""" # Remember starting position - start_B = self.shp.tell() + start_B = self.file.tell() # Calculate size of all shapes - self.shp.seek(0, 2) - size_16b_words = self.shp.tell() + self.file.seek(0, 2) + size_16b_words = self.file.tell() # Calculate size as 16-bit words size_B = size_16b_words // 2 # Return to start - self.shp.seek(start_B) + self.file.seek(start_B) return size_B def _update_file_bbox(self, s: Shape) -> None: @@ -4253,84 +4164,10 @@ def _update_file_mbox(self, s: _HasM | PointM) -> None: # first time mbox is being set self._mbox = mbox - @property - def shapeTypeName(self) -> str: - return SHAPETYPE_LOOKUP[self.shapeType or 0] - - def bbox(self) -> BBox | None: - """Returns the current bounding box for the shapefile which is - the lower-left and upper-right corners. It does not contain the - elevation or measure extremes.""" - return self._bbox - - def zbox(self) -> ZBox | None: - """Returns the current z extremes for the shapefile.""" - return self._zbox - - def mbox(self) -> MBox | None: - """Returns the current m extremes for the shapefile.""" - return self._mbox - - def _shp_or_shx_header( - self, - f: WriteSeekableBinStream, - headerType: Literal["shp", "shx"], - ) -> None: - """Writes the specified header type to the specified file-like object. - Several of the shapefile formats are so similar that a single generic - method to read or write them is warranted.""" - - f.seek(0) - # File code, Unused bytes - f.write(pack(">6i", 9994, 0, 0, 0, 0, 0)) - # File length (Bytes / 2 = 16-bit words) - if headerType == "shp": - f.write(pack(">i", self._shp_file_length_B())) - elif headerType == "shx": - f.write(pack(">i", ((100 + (self.shpNum * 8)) // 2))) - # Version, Shape type - if self.shapeType is None: - self.shapeType = NULL - f.write(pack("<2i", 1000, self.shapeType)) - - # BBox, the shapefile's bounding box (lower left, upper right) - # In such cases of empty shapefiles, ESRI spec says the bbox values are 'unspecified'. - # Not sure what that means, so for now just setting to 0s, which is the same behavior as in previous versions. - # This would also make sense since the Z and M bounds are similarly set to 0 for non-Z/M type shapefiles. - # bbox: BBox = (0, 0, 0, 0) - bbox = self.bbox() or (0, 0, 0, 0) - try: - f.write(pack("<4d", *bbox)) - except StructError: - raise ShapefileException( - "Failed to write shapefile bounding box. Floats required." - ) - - # Elevation - # As per the ESRI shapefile spec, the zbox for non-Z type shapefiles are set to 0s - # zbox : ZBox = (0, 0). We also do this for empty shapefiles and null-shapes only files. - zbox = self.zbox() or (0, 0) - - # Ms - # As per the ESRI shapefile spec, the mbox for non-M type shapefiles are set to 0s - # mbox: Mbox = (0, 0). We also do this for empty shapefiles and null-shapes only files. - mbox = self.mbox() or (0, 0) - - # Try writing - try: - f.write(pack("<4d", *zbox, *replace_None_with_NODATA(mbox))) - except StructError: - raise ShapefileException( - "Failed to write shapefile elevation and measure values. Floats required." - ) - def shape( self, s: Shape | HasGeoInterface | GeoJSONHomogeneousGeometryObject, - ) -> None: - # Balance if already not balanced - if self.autoBalance and self.dbf_writer.recNum < self.shpNum: - self.balance() + ) -> tuple[int, int]: # Check is shape or import from geojson if not isinstance(s, Shape): if hasattr(s, "__geo_interface__"): @@ -4346,12 +4183,10 @@ def shape( ) s = Shape._from_geojson(shape_dict) # Write to file - offset_B, length_16bw = self._shp_record(s) - if self._shx: - self._shx_record(offset_B, length_16bw) + return self._shp_record(s) def _shp_record(self, s: Shape) -> tuple[int, int]: - offset = self.shp.tell() + offset = self.file.tell() self.shpNum += 1 # Shape Type @@ -4402,18 +4237,31 @@ def _shp_record(self, s: Shape) -> tuple[int, int]: length_16bw = n // 2 # 4 bytes in is the content length field + # (where we wrote the -1 placeholder above) b_io.seek(4) b_io.write(pack(">i", length_16bw)) # Flush to file. b_io.seek(0) - self.shp.write(b_io.read()) + self.file.write(b_io.read()) return offset, length_16bw + +class ShxWriter(_ShpShxHeaderWriter): + ext = ".shx" + + def __init__( + self, + shx: str | PathLike[Any] | WriteSeekableBinStream, + shp_writer: _ShpWriterInfo, + ): + super().__init__(file=shx) + self.shp_writer = shp_writer + def _shx_record(self, offset_B: int, length_16bw: int) -> None: """Writes the shx records.""" - f = self.shx + f = self.file max_value_B = (1 << 32) - 2 # 4294967294 Bytes if offset_B > max_value_B: raise ShapefileException( @@ -4424,16 +4272,264 @@ def _shx_record(self, offset_B: int, length_16bw: int) -> None: offset_16bw = offset_B // 2 f.write(pack(">2i", offset_16bw, length_16bw)) + def _header(self) -> None: + super()._shp_or_shx_header(shp_info=self.shp_writer) + + def _write_file_length(self) -> None: + # self.file required to be at correct position, e.g. + # if called by self._header + self.file.write(pack(">i", ((100 + (self.shp_writer.shpNum * 8)) // 2))) + + +class Writer(_HasExitStack): + """Provides write support for ESRI Shapefiles.""" + + # W = TypeVar("W", bound=WriteSeekableBinStream) + + FileProto = WriteSeekableBinStream + new_file_obj_mode = "w+b" + ext = None + ExceptionClass = ShapefileException + + def __init__( + self, + target: str | PathLike[Any] | None = None, + shapeType: int | None = None, + autoBalance: bool = False, + *, + encoding: str = "utf-8", + encodingErrors: str = "strict", + shp: WriteSeekableBinStream | None = None, + shx: WriteSeekableBinStream | None = None, + dbf: WriteSeekableBinStream | None = None, + # Keep kwargs even though unused, to preserve PyShp 2.4 API + **kwargs: Any, + ) -> None: + # Don't call super().__init + if target is not None: + try: + target = Path(target) + except (ValueError, TypeError): + raise TypeError( + f"The target filepath {target!r} must be a str, Path, or os.PathLike, not {type(target)}." + ) + self.target: Path | None = target + + # User settable - see ### Geometry and Record Balancing in README.md + self.autoBalance = autoBalance + + self._shp: Path | WriteSeekableBinStream | None = shp + self._shx: Path | WriteSeekableBinStream | None = shx + self._dbf: Path | WriteSeekableBinStream | None = dbf + self._shp_writer: ShpWriter | None = None + self._shx_writer: ShxWriter | None = None + self._dbf_writer: DbfWriter | None = None + self.exit_stack = ExitStack() + if target is not None: + if shp or shx or dbf: + raise TypeError( + "Unused kwargs were silently ignored by previous versions of PyShp. " + "Either specify target (first positional only arg), " + "or shp and/or dbf, possible plus shx" + ) + self._shp = target.with_suffix(".shp") + self._shx = target.with_suffix(".shx") + self._dbf = target.with_suffix(".dbf") + elif not (shp or shx or dbf): + raise TypeError( + "Either the target filepath, or any of shp, shx, or dbf must be set to create a shapefile." + ) + if self._shp: + self._shp_writer = ShpWriter(shp=self._shp, shapeType=shapeType) + self.exit_stack.enter_context(self._shp_writer) + if self._shx: + if self._shp_writer is None: + raise ShapefileException( + "a .shp file object is required to for a .shx file object " + ", to have shapes to actually create indices for. " + ) + self._shx_writer = ShxWriter(shx=self._shx, shp_writer=self._shp_writer) + self.exit_stack.enter_context(self._shx_writer) + if self._dbf: + self._dbf_writer = DbfWriter( + dbf=self._dbf, + encoding=encoding, + encodingErrors=encodingErrors, + ) + self.exit_stack.enter_context(self._dbf_writer) + + @property + def shp_writer(self) -> ShpWriter: + if self._shp_writer is None: + raise ShapefileException( + f"No shp file. Got target: {self.target} & shp: {self._shp}" + ) + return self._shp_writer + + @property + def shx_writer(self) -> ShxWriter: + if self._shx_writer is None: + raise ShapefileException( + f"No shx file. Got target: {self.target} & shx: {self._shx}" + ) + return self._shx_writer + + @property + def dbf_writer(self) -> DbfWriter: + if self._dbf_writer is None: + raise dbfFileException( + f"No dbf file. Got target: {self.target} & dbf: {self._dbf}" + ) + return self._dbf_writer + + @property + def shp(self) -> WriteSeekableBinStream: + return self.shp_writer.file + + @property + def shx(self) -> WriteSeekableBinStream: + return self.shx_writer.file + + @property + def dbf(self) -> WriteSeekableBinStream: + return self.dbf_writer.file + + @property + def fields(self) -> list[Field]: + return self.dbf_writer.fields + + @fields.setter + def fields(self, value: list[Field]) -> None: + self.dbf_writer.fields = value + + @property + def recNum(self) -> int: + if not self._dbf_writer: + return 0 + return self.dbf_writer.recNum + + def balance(self) -> None: + """Adds corresponding empty attributes or null geometry records depending + on which type of record was created to make sure all three files + are in synch.""" + while self.dbf_writer.recNum > self.shp_writer.shpNum: + self.null() + while self.dbf_writer.recNum < self.shp_writer.shpNum: + self.record() + + def shape( + self, + s: Shape | HasGeoInterface | GeoJSONHomogeneousGeometryObject, + ) -> None: + # Balance if already not balanced + if self.autoBalance and self.dbf_writer.recNum < self.shp_writer.shpNum: + self.balance() + offset_B, length_16bw = self.shp_writer.shape(s) + if self._shx: + self.shx_writer._shx_record(offset_B, length_16bw) + def record( self, *recordList: RecordValue, **recordDict: RecordValue, ) -> None: # Balance if already not balanced - if self.autoBalance and self.dbf_writer.recNum > self.shpNum: + if self.autoBalance and self.dbf_writer.recNum > self.shp_writer.shpNum: self.balance() self.dbf_writer.record(*recordList, **recordDict) + def __len__(self) -> int: + """Returns the current number of features written to the shapefile. + If shapes and records are unbalanced, the length is considered the highest + of the two.""" + + if self._dbf_writer and self._shp_writer: + return max(self.dbf_writer.recNum, self.shp_writer.shpNum) + if self._shp_writer: + return self.shp_writer.shpNum + if self._dbf_writer: + return self.dbf_writer.recNum + return 0 + + def __del__(self) -> None: + self.close() + + def close(self) -> None: + """ + Write final shp, shx, and dbf headers, close opened files. + """ + + # Check if user supplied shp or dbf file objects have + # already been closed by the user for some reason. + # + # TODO: Do we really need to support this? + # Isn't "for some reason" an edge case by definition? + # A user who supplies custom file objects for shp and + # dbf, opens a Writer + # to partially write the Shapefile, manually closes the .shp object + # but uses the context manager or calls .close expecting + # Writer to create the dbf's header? + shp_open = False if self._shp_writer is None else self.shp_writer.is_open() + dbf_open = False if self._dbf_writer is None else self.dbf_writer.is_open() + # dbf_open = ( + # False + # if self._dbf_writer is None + # else _is_file_obj_open(self.dbf_writer.file) + # ) + + # Balance if already not balanced + if shp_open and dbf_open: + if self.autoBalance: + self.balance() + if self.dbf_writer.recNum != self.shp_writer.shpNum: + raise ShapefileException( + "When saving both the dbf and shp file, " + f"the number of records ({self.dbf_writer.recNum}) must correspond " + f"with the number of shapes ({self.shp_writer.shpNum})" + ) + + # self.exit_stack contains any self.{shp,shx,dbf}_writer instances. + # Those created are triggered to write their headers here. + # The point of using the exit stack is + # i) that the above are guaranteed to be pushed on to + # it to dynamically if accessed via their properties. + # ii) but in particular it should not contain + # user-supplied, already opened file objects that + # might be closed by an outer context manager). + # + # Idempotent. + super().close() + + @property + def shapeType(self) -> int | None: + return self.shp_writer.shapeType + + @shapeType.setter + def shapeType(self, val: int | None) -> None: + # User settable - see #### Setting the Shape Type in README.md + self.shp_writer.shapeType = val + + @property + def shapeTypeName(self) -> str: + return SHAPETYPE_LOOKUP[self.shapeType or 0] + + @property + def shpNum(self) -> int: + return self.shp_writer.shpNum + + @functools.wraps(_ShpWriterInfo.bbox) + def bbox(self) -> BBox | None: + return self.shp_writer._bbox + + @functools.wraps(_ShpWriterInfo.zbox) + def zbox(self) -> ZBox | None: + return self.shp_writer._zbox + + @functools.wraps(_ShpWriterInfo.mbox) + def mbox(self) -> MBox | None: + return self.shp_writer._mbox + + @functools.wraps(DbfWriter.field) def field( # Types of args should match *Field self, @@ -4444,15 +4540,6 @@ def field( ) -> None: self.dbf_writer.field(name, field_type, size, decimal) - def balance(self) -> None: - """Adds corresponding empty attributes or null geometry records depending - on which type of record was created to make sure all three files - are in synch.""" - while self.dbf_writer.recNum > self.shpNum: - self.null() - while self.dbf_writer.recNum < self.shpNum: - self.record() - def null(self) -> None: """Creates a null shape.""" self.shape(NullShape()) @@ -4620,45 +4707,46 @@ def _filter_network_doctests( yield example -def _replace_remote_url( - old_url: str, - # Default port of Python http.server - port: int = 8000, - scheme: str = "http", - netloc: str = "localhost", - path: str | None = None, - params: str = "", - query: str = "", - fragment: str = "", -) -> str: +def _replace_remote_url_with_localhost(old_url: str) -> str: + old_split = urlsplit(old_url) # Strip subpaths, so an artefacts # repo or file tree can be simpler and flat - if path is None: - path = old_split.path.rpartition("/")[2] - - if port not in (None, ""): # type: ignore[comparison-overlap] - netloc = f"{netloc}:{port}" + path = old_split.path.rpartition("/")[2] new_split = old_split._replace( - scheme=scheme, - netloc=netloc, + scheme="http", + netloc="localhost:8000", # Default port of Python http.server path=path, - query=query, - fragment=fragment, + query="", + fragment="", ) - new_url = urlunsplit(new_split) - return new_url + return str(urlunsplit(new_split)) + + +_URL_STR_LITERAL_PATTERN = r'"(https?://.*)"' + + +def _change_remote_url_match_to_localhost( + match: re.Match[Any], # A Match from _URL_STR_LITERAL_PATTERN above +) -> str: + + old_url = match.group(1) + new_url = _replace_remote_url_with_localhost(old_url) + return f'"{new_url}"' + +def _test( + temp_dir: str | None = None, + args: list[str] = sys.argv[1:], + verbosity: bool = False, +) -> int: -def _test(args: list[str] = sys.argv[1:], verbosity: bool = False) -> int: if verbosity == 0: print("Getting doctests...") - import re - tests = _get_doctests() if len(args) >= 2 and args[0] == "-m": @@ -4677,17 +4765,23 @@ def _test(args: list[str] = sys.argv[1:], verbosity: bool = False) -> int: print("Replacing remote urls with http://localhost in doctests...") for example in tests.examples: - match_url_str_literal = re.search(r'"(https://.*)"', example.source) - if not match_url_str_literal: - continue - old_url = match_url_str_literal.group(1) - new_url = _replace_remote_url(old_url) - example.source = example.source.replace(old_url, new_url) + example.source = re.sub( + pattern=_URL_STR_LITERAL_PATTERN, + repl=_change_remote_url_match_to_localhost, + string=example.source, + ) + + if temp_dir is not None: + for example in tests.examples: + example.source = example.source.replace("shapefiles/test/", f"{temp_dir}/") runner = doctest.DocTestRunner(verbose=verbosity, optionflags=doctest.FAIL_FAST) if verbosity == 0: print(f"Running {len(tests.examples)} doctests...") + # Deleting a temp dir will error if it contains shapefiles to which + # unclosed Readers still file objects open, + # regardless of using clear_globs=True or calling gc.collect afterwards. failure_count, __test_count = runner.run(tests) # print results @@ -4707,7 +4801,8 @@ def main() -> None: Doctests are contained in the file 'README.md', and are tested using the built-in testing libraries. """ - failure_count = _test() + with tempfile.TemporaryDirectory() as td: + failure_count = _test(Path(td).as_posix()) sys.exit(failure_count) diff --git a/test_shapefile.py b/test_shapefile.py index a865385e..d90843e2 100644 --- a/test_shapefile.py +++ b/test_shapefile.py @@ -495,7 +495,7 @@ def test_reader_nvkelso_files_from_localhost_url(): # defined in ./.github/actions/test/actions.yml def Reader(url): - new_url = shapefile._replace_remote_url(url) + new_url = shapefile._replace_remote_url_with_localhost(url) print(f"repr(new_url): {repr(new_url)}") return shapefile.Reader(new_url) @@ -532,7 +532,7 @@ def test_reader_urls(): if shapefile.REPLACE_REMOTE_URLS_WITH_LOCALHOST: def Reader(url): - new_url = shapefile._replace_remote_url(url) + new_url = shapefile._replace_remote_url_with_localhost(url) print(f"repr(new_url): {repr(new_url)}") return shapefile.Reader(new_url) else: @@ -1296,12 +1296,12 @@ def test_reader_len_no_dbf_shx(): assert len(sf) == len(sf.shapes()) -def test_reader_corrupt_files(): +def test_reader_corrupt_files(tmp_path): """ Assert that reader is able to handle corrupt files by strictly going off the header information. """ - basename = "shapefiles/test/corrupt_too_long" + basename = str(tmp_path / "corrupt_too_long") # write a shapefile with junk byte data at end of files with shapefile.Writer(basename) as w: @@ -1540,7 +1540,7 @@ def test_reader_zip_polyylinez_no_m_itershaperecords(): Original source: https://github.com/OpenNHM/AvaFrameData/blob/main/avaPopeletzbach/ License CC-BY-4.0 """ - with shapefile.Reader("shapefiles/test/REL.zip/REL/releaseArea20090407") as sf: + with shapefile.Reader("shapefiles/REL.zip/REL/releaseArea20090407") as sf: for _shaperec in sf.iterShapeRecords(): pass From 29f043ca716f074fa59dedecdd94bbea64706aef Mon Sep 17 00:00:00 2001 From: James Parrott <80779630+JamesParrott@users.noreply.github.com> Date: Wed, 27 May 2026 17:18:57 +0100 Subject: [PATCH 2/2] Replace Github URLs of shapefiles with other victims. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 91e205e4..37e18442 100644 --- a/README.md +++ b/README.md @@ -440,7 +440,7 @@ Finally, you can use all of the above methods to read shapefiles directly from t >>> # from a zipped shapefile on website - >>> sf = shapefile.Reader("https://github.com/JamesParrott/PyShp_test_shapefile/raw/main/gis_osm_natural_a_free_1.zip") + >>> sf = shapefile.Reader("https://geodata.ucdavis.edu/gadm/gadm4.1/shp/gadm41_ATA_shp.zip") >>> # from a shapefile collection of files in a github repository >>> sf = shapefile.Reader("https://pubs.usgs.gov/of/2000/of00-006/gisdata/coverage/airports.shp") @@ -1305,7 +1305,7 @@ of records and complex geometries. As an example, let's load this Natural Earth shapefile of more than 4000 global administrative boundary polygons: - >>> sf = shapefile.Reader("https://github.com/nvkelso/natural-earth-vector/blob/master/10m_cultural/ne_10m_admin_1_states_provinces?raw=true") + >>> sf = shapefile.Reader("https://archive.org/download/ne_10m_admin_1_states_provinces/ne_10m_admin_1_states_provinces.zip") When first creating the Reader class, the library only reads the header information and leaves the rest of the file contents alone. Once you call the records() and shapes()