Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions sagemaker-core/src/sagemaker/core/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,7 +1688,11 @@ def _is_bad_path(path, base):
bool: True if the path is not rooted under the base directory, False otherwise.
"""
# joinpath will ignore base if path is absolute
return not _get_resolved_path(joinpath(base, path)).startswith(base)
resolved = _get_resolved_path(joinpath(base, path))
Comment thread
aviruthen marked this conversation as resolved.
Comment thread
aviruthen marked this conversation as resolved.
Comment thread
aviruthen marked this conversation as resolved.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The os.path.commonpath fix is a good security improvement over startswith, which can be tricked by paths like /base/x2 matching /base/x. However, there's a subtle issue: os.path.commonpath normalizes paths, but base may not be normalized in the same way. You should ensure both resolved and base are consistently normalized before comparison. Consider:

resolved = _get_resolved_path(joinpath(base, path))
base = _get_resolved_path(base)  # ensure consistent normalization
try:
    return os.path.commonpath([resolved, base]) != base
except ValueError:
    return True

Otherwise, if base has a trailing slash or different normalization than what commonpath returns, the comparison could give incorrect results.

try:
return os.path.commonpath([resolved, base]) != base
except ValueError:
return True # If we can't determine safety, treat as bad path


def _is_bad_link(info, base):
Expand All @@ -1704,23 +1708,26 @@ def _is_bad_link(info, base):
bool: True if the link is not rooted under the base directory, False otherwise.
"""
# Links are interpreted relative to the directory containing the link
# Wrap with _get_resolved_path to ensure consistent normalization for commonpath comparison
tip = _get_resolved_path(joinpath(base, dirname(info.name)))
return _is_bad_path(info.linkname, base=tip)


Comment thread
aviruthen marked this conversation as resolved.
def _get_safe_members(members):
def _get_safe_members(members, base=None):
"""A generator that yields members that are safe to extract.

It filters out bad paths and bad links.

Args:
members (list): A list of members to check.
members (list): A list of TarInfo members to check.
base (str): The base directory for extraction. If None, defaults to the
current working directory (for backward compatibility).

Comment thread
aviruthen marked this conversation as resolved.
Yields:
tarfile.TarInfo: The tar file info.
"""
base = _get_resolved_path("")

if base is None:
base = _get_resolved_path("")
for file_info in members:
if _is_bad_path(file_info.name, base):
logger.error("%s is blocked (illegal path)", file_info.name)
Expand Down Expand Up @@ -1783,7 +1790,8 @@ def custom_extractall_tarfile(tar, extract_path):
if hasattr(tarfile, "data_filter"):
tar.extractall(path=extract_path, filter="data")
else:
tar.extractall(path=extract_path, members=_get_safe_members(tar))
base = _get_resolved_path(extract_path)
Comment thread
aviruthen marked this conversation as resolved.
tar.extractall(path=extract_path, members=_get_safe_members(tar.getmembers(), base))
# Re-validate extracted paths to catch symlink race conditions
_validate_extracted_paths(extract_path)

Expand Down
10 changes: 10 additions & 0 deletions sagemaker-core/src/sagemaker/core/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,19 @@
for backward compatibility and convenience.

Note: Uses lazy imports via __getattr__ to avoid circular import issues.

Comment thread
aviruthen marked this conversation as resolved.
Private tar extraction safety helpers (_get_resolved_path, _is_bad_path,
_is_bad_link, _get_safe_members, _validate_extracted_paths) are internal
implementation details and are NOT re-exported from this package. They can
be imported directly from sagemaker.core.common_utils if needed.

custom_extractall_tarfile is the public entry point for safe tar extraction.
"""
from __future__ import absolute_import

# Public API surface.
# Note: _save_model is underscore-prefixed but was already in __all__ (pre-existing).
Comment thread
aviruthen marked this conversation as resolved.
# custom_extractall_tarfile is the main public entry point for safe tar extraction.
__all__ = [
"_save_model",
"download_file_from_url",
Expand Down
Loading
Loading