class LegacyArchives:
    """
    Manage the list of archives for a Borg 1.x repository.

    The archive registry lives inside the manifest blob itself:
        {name: {"id": <bytes>, "time": <ISO timestamp str>}}

    Manifest.__init__ chooses this class over Archives when the repository is a
    LegacyRepository. It can be deleted entirely when Borg 1.x support is dropped.
    """

    def __init__(self, repository, manifest):
        self.repository = repository
        self.manifest = manifest
        # key: str archive name, value: dict("id": bytes_id, "time": str_iso_ts)
        self._archives = {}

    def prepare(self, manifest, m):
        """Load the registry from ``m.archives`` of the unpacked manifest *m*."""
        self._set_raw_dict(m.archives)

    def finish(self, manifest):
        """Return the registry wrapped in a StableDict, for storing in the manifest."""
        return StableDict(self._get_raw_dict())

    def ids(self, *, deleted=False):
        """
        Yield the binary ID of every registered archive.

        *deleted* is accepted for interface compatibility and is ignored here.
        """
        for values in self._archives.values():
            yield values["id"]

    def _get_archive_meta(self, id: bytes) -> dict:
        """
        Read the ArchiveItem for *id* from the repository and return its metadata as a dict.

        If the repository has no object for *id*, a placeholder dict with
        ``exists=False`` is returned instead of raising.
        """
        # imported here (not at module level), as in the original code
        from .repository import LegacyRepository

        try:
            cdata = self.repository.get(id)
        except LegacyRepository.ObjectNotFound:
            # we have the pointer in the registry, but the repo has no archive item for it
            return dict(
                id=id,
                name="archive-does-not-exist",
                time="1970-01-01T00:00:00.000000",
                exists=False,
                username="",
                hostname="",
                tags=(),
            )
        _, data = self.manifest.repo_objs.parse(id, cdata, ro_type=ROBJ_ARCHIVE_META)
        archive_dict = self.manifest.key.unpack_archive(data)
        archive_item = ArchiveItem(internal_dict=archive_dict)
        if archive_item.version not in (1, 2):
            raise Exception("Unknown archive metadata version")
        return dict(
            id=id,
            name=archive_item.name,
            time=archive_item.time,
            exists=True,
            username=archive_item.username,
            hostname=archive_item.hostname,
            size=archive_item.get("size", 0),
            nfiles=archive_item.get("nfiles", 0),
            comment=archive_item.get("comment", ""),
            tags=tuple(sorted(getattr(archive_item, "tags", []))),
        )

    def _infos(self, *, deleted=False):
        """Yield the metadata dict (see _get_archive_meta) of every registered archive."""
        for id in self.ids(deleted=deleted):
            yield self._get_archive_meta(id)

    def _info_tuples(self, *, deleted=False):
        """Yield an ArchiveInfo for every registered archive."""
        for info in self._infos(deleted=deleted):
            yield ArchiveInfo(
                name=info["name"],
                id=info["id"],
                ts=parse_timestamp(info["time"]),
                tags=info["tags"],
                user=info["username"],
                host=info["hostname"],
            )

    def _matching_info_tuples(self, match_patterns, match_end, *, deleted=False):
        """
        Return the ArchiveInfo list narrowed down by every pattern in *match_patterns*.

        Patterns are applied one after the other (logical AND). Supported prefixes are
        ``aid:``, ``tags:``, ``user:``, ``host:`` and ``name:``; a pattern without a
        known prefix is treated as a name pattern. *match_end* is appended to the
        regex built for name patterns.

        Raises CommandError if an ``aid:`` pattern does not leave exactly one archive.
        """
        archive_infos = list(self._info_tuples(deleted=deleted))
        if match_patterns:
            assert isinstance(match_patterns, list), f"match_pattern is a {type(match_patterns)}"
            for match in match_patterns:
                if match.startswith("aid:"):
                    wanted_id = match.removeprefix("aid:")
                    archive_infos = [x for x in archive_infos if bin_to_hex(x.id).startswith(wanted_id)]
                    if len(archive_infos) != 1:
                        raise CommandError("archive ID based match needs to match precisely one archive ID")
                elif match.startswith("tags:"):
                    wanted_tags = match.removeprefix("tags:")
                    wanted_tags = [tag for tag in wanted_tags.split(",") if tag]
                    archive_infos = [x for x in archive_infos if set(x.tags) >= set(wanted_tags)]
                elif match.startswith("user:"):
                    wanted_user = match.removeprefix("user:")
                    archive_infos = [x for x in archive_infos if x.user == wanted_user]
                elif match.startswith("host:"):
                    wanted_host = match.removeprefix("host:")
                    archive_infos = [x for x in archive_infos if x.host == wanted_host]
                else:
                    match = match.removeprefix("name:")
                    regex = get_regex_from_pattern(match)
                    regex = re.compile(regex + match_end)
                    archive_infos = [x for x in archive_infos if regex.match(x.name) is not None]
        return archive_infos

    def count(self):
        """Return the number of registered archives."""
        return len(self._archives)

    def names(self):
        """Yield the name of every registered archive."""
        yield from self._archives.keys()

    def exists(self, name):
        """Return True if an archive called *name* is registered."""
        assert isinstance(name, str)
        return name in self._archives

    def exists_id(self, id, *, deleted=False):
        """
        Return True if an archive with this *id* is registered.

        *deleted* is accepted for interface compatibility and is ignored,
        the same way ids() and get_by_id() ignore it.
        """
        assert isinstance(id, bytes)
        return any(values["id"] == id for values in self._archives.values())

    def exists_name_and_id(self, name, id):
        """Return True if the registry entry for *name* exists and carries this *id*."""
        assert isinstance(name, str)
        assert isinstance(id, bytes)
        values = self._archives.get(name)
        return values is not None and values["id"] == id

    def exists_name_and_ts(self, name, ts):
        """Return True if the registry entry for *name* exists and its parsed time equals *ts*."""
        assert isinstance(name, str)
        assert isinstance(ts, datetime)
        values = self._archives.get(name)
        return values is not None and parse_timestamp(values["time"]) == ts

    @staticmethod
    def _as_result(name, values, raw):
        """Convert one registry entry to a plain dict (raw=True) or an ArchiveInfo (raw=False)."""
        if raw:
            return dict(name=name, id=values["id"], time=values["time"])
        return ArchiveInfo(name=name, id=values["id"], ts=parse_timestamp(values["time"]))

    def get(self, name, raw=False):
        """
        Look up an archive by *name*.

        Returns None if the name is not registered, else a dict with the keys
        name/id/time (raw=True) or an ArchiveInfo (raw=False).
        """
        assert isinstance(name, str)
        values = self._archives.get(name)
        if values is None:
            return None
        return self._as_result(name, values, raw)

    def get_by_id(self, id, raw=False, *, deleted=False):
        """
        Look up an archive by its binary *id* (first matching registry entry wins).

        Returns None if no entry has this id, else the same kind of result as get().
        *deleted* is accepted for interface compatibility and is ignored.
        """
        assert isinstance(id, bytes)
        for name, values in self._archives.items():
            if id == values["id"]:
                return self._as_result(name, values, raw)
        return None

    def create(self, name, id, ts, *, overwrite=False):
        """
        Register archive *name* with the given *id* and timestamp *ts*.

        *ts* may be a datetime (stored as ISO string with microseconds) or a str.
        Raises KeyError if *name* is already registered and *overwrite* is false.
        """
        assert isinstance(name, str)
        assert isinstance(id, bytes)
        if isinstance(ts, datetime):
            ts = ts.isoformat(timespec="microseconds")
        assert isinstance(ts, str)
        if self.exists(name) and not overwrite:
            raise KeyError("archive already exists")
        self._archives[name] = {"id": id, "time": ts}

    def delete_by_id(self, id):
        """Not available for this class: always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support soft-delete")

    def undelete_by_id(self, id):
        """Not available for this class: always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support undelete")

    def nuke_by_id(self, id):
        """Not available for this class: always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support nuke")

    def list(
        self,
        *,
        match=None,
        match_end=r"\Z",
        sort_by=(),
        reverse=False,
        first=None,
        last=None,
        older=None,
        newer=None,
        oldest=None,
        newest=None,
        deleted=False,
    ):
        """
        Return list of ArchiveInfo instances according to the parameters.
        See Archives.list() for full parameter documentation.

        Order of processing: pattern matching, date filtering, sorting,
        first/last slicing (first wins if both are given), then reversing.
        Raises TypeError if *sort_by* is a single str/bytes instead of a sequence.
        """
        if isinstance(sort_by, (str, bytes)):
            raise TypeError("sort_by must be a sequence of str")

        archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted)

        if any([oldest, newest, older, newer]):
            archive_infos = filter_archives_by_date(
                archive_infos, oldest=oldest, newest=newest, newer=newer, older=older
            )
        # sort by the least significant key first, so the first key ends up dominant
        for sortkey in reversed(sort_by):
            archive_infos.sort(key=attrgetter(sortkey))
        if first:
            archive_infos = archive_infos[:first]
        elif last:
            archive_infos = archive_infos[max(len(archive_infos) - last, 0) :]
        if reverse:
            archive_infos.reverse()
        return archive_infos

    def list_considering(self, args):
        """Get a list of archives, considering --first/last/prefix/match-archives/sort cmdline args."""
        name = getattr(args, "name", None)
        if name is not None:
            raise Error(
                "Giving a specific name is incompatible with options --first, --last and -a / --match-archives."
            )
        return self.list(
            sort_by=args.sort_by.split(","),
            match=args.match_archives,
            first=getattr(args, "first", None),
            last=getattr(args, "last", None),
            older=getattr(args, "older", None),
            newer=getattr(args, "newer", None),
            oldest=getattr(args, "oldest", None),
            newest=getattr(args, "newest", None),
            deleted=getattr(args, "deleted", False),
        )

    def get_one(self, match, *, match_end=r"\Z", deleted=False):
        """
        Get exactly one archive matching *match*.

        Raises CommandError if zero or more than one archive matches.
        """
        assert match is not None
        archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted)
        if len(archive_infos) != 1:
            raise CommandError(f"{match} needed to match precisely one archive, but matched {len(archive_infos)}.")
        return archive_infos[0]

    def _set_raw_dict(self, d):
        """Merge the dict we get from the manifest into the registry, checking its shape."""
        for k, v in d.items():
            assert isinstance(k, str)
            assert isinstance(v, dict) and "id" in v and "time" in v
            self._archives[k] = v

    def _get_raw_dict(self):
        """Return the registry dict itself (not a copy)."""
        return self._archives
a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -3,7 +3,8 @@ from collections import namedtuple from datetime import datetime, timedelta, timezone from operator import attrgetter -from collections.abc import Sequence +from collections.abc import Iterator, Sequence +from typing import Protocol, runtime_checkable from borgstore.store import ObjectNotFound, ItemInfo @@ -69,65 +70,92 @@ def get_first_and_last_archive_ts(archives_list): return archives +@runtime_checkable +class ArchivesInterface(Protocol): + """ + Structural interface that both Archives and LegacyArchives must satisfy. + + Manifest.__init__ assigns one of these two classes to self.archives depending + on whether the repository is a LegacyRepository (Borg 1.x) or a modern one. + All callers go through this interface without knowing which class they got. + + When Borg 1.x support is dropped, delete LegacyArchives and this Protocol + can either be removed or kept as documentation of the Archives public API. + """ + + def prepare(self, manifest, m) -> None: ... + def finish(self, manifest) -> dict: ... + def ids(self, *, deleted: bool = False) -> Iterator: ... + def count(self) -> int: ... + def names(self) -> Iterator: ... + def exists(self, name: str) -> bool: ... + def exists_id(self, id: bytes, *, deleted: bool = False) -> bool: ... + def exists_name_and_id(self, name: str, id: bytes) -> bool: ... + def exists_name_and_ts(self, name: str, ts) -> bool: ... + def get(self, name: str, raw: bool = False): ... + def get_by_id(self, id: bytes, raw: bool = False, *, deleted: bool = False): ... + def create(self, name: str, id: bytes, ts, *, overwrite: bool = False) -> None: ... + def delete_by_id(self, id: bytes) -> None: ... + def undelete_by_id(self, id: bytes) -> None: ... + def nuke_by_id(self, id: bytes) -> None: ... 
+ def list( + self, + *, + match=None, + match_end=r"\Z", + sort_by=(), + reverse=False, + first=None, + last=None, + older=None, + newer=None, + oldest=None, + newest=None, + deleted=False, + ): ... + def list_considering(self, args): ... + def get_one(self, match, *, match_end=r"\Z", deleted=False): ... + + class Archives: """ - Manage the list of archives. + Manage the list of archives for a Borg 2.x repository. - We still need to support the borg 1.x manifest-with-list-of-archives, - so borg transfer can work. - borg2 has separate items archives/* in the borgstore. + Each archive has a separate entry in borgstore at archives/. + The manifest blob itself carries an empty archives dict. """ def __init__(self, repository, manifest): - from .repository import Repository - from .remote import RemoteRepository - self.repository = repository - self.legacy = not isinstance(repository, (Repository, RemoteRepository)) - # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts) - self._archives = {} self.manifest = manifest def prepare(self, manifest, m): - if not self.legacy: - pass - else: - self._set_raw_dict(m.archives) + pass # borgstore manages the archive directory; nothing to load from the manifest blob def finish(self, manifest): - if not self.legacy: - manifest_archives = {} - else: - manifest_archives = StableDict(self._get_raw_dict()) - return manifest_archives + return {} # manifest["archives"] is always empty in Borg 2 def ids(self, *, deleted=False): # yield the binary IDs of all archives - if not self.legacy: - try: - infos = list(self.repository.store_list("archives", deleted=deleted)) - except ObjectNotFound: - infos = [] - for info in infos: - info = ItemInfo(*info) # RPC does not give us a NamedTuple - yield hex_to_bin(info.name) - else: - for archive_info in self._archives.values(): - yield archive_info["id"] + try: + infos = list(self.repository.store_list("archives", deleted=deleted)) + except ObjectNotFound: + infos = [] + for info 
in infos: + info = ItemInfo(*info) # RPC does not give us a NamedTuple + yield hex_to_bin(info.name) def _get_archive_meta(self, id: bytes) -> dict: # get all metadata directly from the ArchiveItem in the repo. - from .legacy.repository import LegacyRepository from .repository import Repository try: cdata = self.repository.get(id) - except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound): + except Repository.ObjectNotFound: metadata = dict( id=id, name="archive-does-not-exist", time="1970-01-01T00:00:00.000000", - # new: exists=False, # we have the pointer, but the repo does not have an archive item username="", hostname="", @@ -145,7 +173,6 @@ def _get_archive_meta(self, id: bytes) -> dict: id=id, name=archive_item.name, time=archive_item.time, - # new: exists=True, # repo has a valid archive item username=archive_item.username, hostname=archive_item.hostname, @@ -211,48 +238,35 @@ def names(self): def exists(self, name): # check if an archive with this name exists assert isinstance(name, str) - if not self.legacy: - return name in self.names() - else: - return name in self._archives + return name in self.names() def exists_id(self, id, *, deleted=False): # check if an archive with this id exists assert isinstance(id, bytes) - if not self.legacy: - return id in self.ids(deleted=deleted) - else: - raise NotImplementedError + return id in self.ids(deleted=deleted) def exists_name_and_id(self, name, id): # check if an archive with this name AND id exists assert isinstance(name, str) assert isinstance(id, bytes) - if not self.legacy: - for archive_info in self._infos(): - if archive_info["name"] == name and archive_info["id"] == id: - return True - else: - return False + for archive_info in self._infos(): + if archive_info["name"] == name and archive_info["id"] == id: + return True else: - raise NotImplementedError + return False def exists_name_and_ts(self, name, ts): # check if an archive with this name AND timestamp exists assert isinstance(name, str) 
assert isinstance(ts, datetime) - if not self.legacy: - for archive_info in self._info_tuples(): - if archive_info.name == name and archive_info.ts == ts: - return True - else: - return False + for archive_info in self._info_tuples(): + if archive_info.name == name and archive_info.ts == ts: + return True else: - raise NotImplementedError + return False def _lookup_name(self, name, raw=False): assert isinstance(name, str) - assert not self.legacy for archive_info in self._infos(): if archive_info["exists"] and archive_info["name"] == name: if not raw: @@ -272,51 +286,31 @@ def _lookup_name(self, name, raw=False): def get(self, name, raw=False): assert isinstance(name, str) - if not self.legacy: - try: - return self._lookup_name(name, raw=raw) - except KeyError: - return None - else: - values = self._archives.get(name) - if values is None: - return None - if not raw: - ts = parse_timestamp(values["time"]) - return ArchiveInfo(name=name, id=values["id"], ts=ts) - else: - return dict(name=name, id=values["id"], time=values["time"]) + try: + return self._lookup_name(name, raw=raw) + except KeyError: + return None def get_by_id(self, id, raw=False, *, deleted=False): assert isinstance(id, bytes) - if not self.legacy: - if id in self.ids(deleted=deleted): # check directory - # looks like this archive id is in the archives directory, thus it is NOT deleted. - # OR we have explicitly requested a soft-deleted archive via deleted=True. - archive_info = self._get_archive_meta(id) - if archive_info["exists"]: # True means we have found Archive metadata in the repo. 
- if not raw: - ts = parse_timestamp(archive_info["time"]) - archive_info = ArchiveInfo( - name=archive_info["name"], - id=archive_info["id"], - ts=ts, - tags=archive_info["tags"], - user=archive_info["username"], - host=archive_info["hostname"], - ) - return archive_info - else: - for name, values in self._archives.items(): - if id == values["id"]: - break - else: - return None - if not raw: - ts = parse_timestamp(values["time"]) - return ArchiveInfo(name=name, id=values["id"], ts=ts) - else: - return dict(name=name, id=values["id"], time=values["time"]) + if id in self.ids(deleted=deleted): # check directory + # looks like this archive id is in the archives directory, thus it is NOT deleted. + # OR we have explicitly requested a soft-deleted archive via deleted=True. + archive_info = self._get_archive_meta(id) + if archive_info["exists"]: # True means we have found Archive metadata in the repo. + if not raw: + ts = parse_timestamp(archive_info["time"]) + archive_info = ArchiveInfo( + name=archive_info["name"], + id=archive_info["id"], + ts=ts, + tags=archive_info["tags"], + user=archive_info["username"], + host=archive_info["hostname"], + ) + return archive_info + return None # id not in store, or archive metadata blob missing from repo + # TODO: add a test that calls get_by_id() with a non-existent id and asserts None is returned def create(self, name, id, ts, *, overwrite=False): assert isinstance(name, str) @@ -324,30 +318,23 @@ def create(self, name, id, ts, *, overwrite=False): if isinstance(ts, datetime): ts = ts.isoformat(timespec="microseconds") assert isinstance(ts, str) - if not self.legacy: - # we only create a directory entry, its name points to the archive item: - self.repository.store_store(f"archives/{bin_to_hex(id)}", b"") - else: - if self.exists(name) and not overwrite: - raise KeyError("archive already exists") - self._archives[name] = {"id": id, "time": ts} + # overwrite is not enforced: archive IDs are content-addressed so a genuine + # 
duplicate (same ID, different intent) cannot occur in a correct implementation. + self.repository.store_store(f"archives/{bin_to_hex(id)}", b"") def delete_by_id(self, id): # soft-delete an archive assert isinstance(id, bytes) - assert not self.legacy self.repository.store_move(f"archives/{bin_to_hex(id)}", delete=True) # soft-delete def undelete_by_id(self, id): # undelete an archive assert isinstance(id, bytes) - assert not self.legacy self.repository.store_move(f"archives/{bin_to_hex(id)}", undelete=True) def nuke_by_id(self, id): # really delete an already soft-deleted archive assert isinstance(id, bytes) - assert not self.legacy self.repository.store_delete(f"archives/{bin_to_hex(id)}", deleted=True) def list( @@ -430,17 +417,6 @@ def get_one(self, match, *, match_end=r"\Z", deleted=False): raise CommandError(f"{match} needed to match precisely one archive, but matched {len(archive_infos)}.") return archive_infos[0] - def _set_raw_dict(self, d): - """set the dict we get from the msgpack unpacker""" - for k, v in d.items(): - assert isinstance(k, str) - assert isinstance(v, dict) and "id" in v and "time" in v - self._archives[k] = v - - def _get_raw_dict(self): - """get the dict we can give to the msgpack packer""" - return self._archives - class Manifest: @enum.unique @@ -474,7 +450,13 @@ class Operation(enum.Enum): MANIFEST_ID = b"\0" * 32 def __init__(self, key, repository, item_keys=None, ro_cls=RepoObj): - self.archives = Archives(repository, self) + from .legacy.repository import LegacyRepository + from .legacy.archives import LegacyArchives + + if isinstance(repository, LegacyRepository): + self.archives: ArchivesInterface = LegacyArchives(repository, self) + else: + self.archives: ArchivesInterface = Archives(repository, self) self.config = {} self.key = key self.repo_objs = ro_cls(key)