From a1c1a0ef0773d1921058de760d00b65b6b395472 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Jun 2026 16:47:42 +0530 Subject: [PATCH 1/6] Move --ignore functionality to codebase creation Deprecate --ignore and --include pre-scan plugins and move the ignore/include functionality to codebase import stage to get rid of multiple codebase walks. Signed-off-by: Ayan Sinha Mahapatra --- pyproject-scancode-toolkit-mini.toml | 1 - pyproject-scancode-toolkit.toml | 1 - pyproject.toml | 1 - src/commoncode/resource.py | 85 ++++++++++++--- src/scancode/cli.py | 34 +++++- src/scancode/plugin_ignore.py | 138 ++++++++++-------------- tests/commoncode/test_fileset.py | 5 + tests/scancode/data/help/help_linux.txt | 4 +- tests/scancode/test_plugin_ignore.py | 29 +---- 9 files changed, 170 insertions(+), 128 deletions(-) diff --git a/pyproject-scancode-toolkit-mini.toml b/pyproject-scancode-toolkit-mini.toml index 06ad0bfdd3f..f0a5696872e 100644 --- a/pyproject-scancode-toolkit-mini.toml +++ b/pyproject-scancode-toolkit-mini.toml @@ -256,7 +256,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc. [project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/pyproject-scancode-toolkit.toml b/pyproject-scancode-toolkit.toml index ade84a60c2f..6ee2048f76d 100644 --- a/pyproject-scancode-toolkit.toml +++ b/pyproject-scancode-toolkit.toml @@ -257,7 +257,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc.
[project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/pyproject.toml b/pyproject.toml index 8c90c9078db..a74f58405a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,7 +262,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc. [project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index d19e2da842d..19bfa955e36 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -39,6 +39,7 @@ from commoncode.datautils import List from commoncode.datautils import Mapping from commoncode.datautils import String +from commoncode.fileset import is_included from commoncode.filetype import is_file as filetype_is_file from commoncode.filetype import is_special from commoncode.fileutils import as_posixpath @@ -62,7 +63,7 @@ # Tracing flags TRACE = False -TRACE_DEEP = False +TRACE_DEEP = True def logger_debug(*args): @@ -98,7 +99,7 @@ def skip_ignored(location): if TRACE_DEEP: logger_debug() logger_debug( - "Codebase.populate: walk: ignored loc:", + "Codebase.populate: walk: skip_ignored:", location, "ignored:", ignored(location), @@ -109,6 +110,42 @@ def skip_ignored(location): return is_special(location) or ignored(location) +def is_ignored(location, includes=None, excludes=None): + + excludes = { + pattern: 'User ignore: Supplied by --ignore' for pattern in excludes + } + + includes = { + pattern: 'User include: Supplied by --include' for pattern in includes + } + + included_from_options = is_included( + path=location, + includes=includes, + excludes=excludes, + ) + + if TRACE_DEEP: + logger_debug( + "Codebase.populate: walk: is_ignored:", + "is_ignored: 
location:", + location, + "included_from_options:", + included_from_options, + "skip_ignored", + skip_ignored(location) + ) + + if skip_ignored(location) or not included_from_options: + if TRACE_DEEP: + logger_debug("is_ignored: location:", location, "is_skipped",) + + return True + + return False + + def depth_walk( root_location, max_depth, @@ -202,6 +239,8 @@ class Codebase: __slots__ = ( "max_depth", "location", + "includes", + "ignores", "has_single_resource", "resource_attributes", "resource_class", @@ -236,6 +275,8 @@ def __init__( max_in_memory=10000, max_depth=0, paths=tuple(), + ignores=tuple(), + includes=tuple(), *args, **kwargs, ): @@ -298,6 +339,8 @@ def __init__( # finally populate self.paths = self._prepare_clean_paths(paths) + self.ignores = ignores + self.includes = includes self._populate() def _prepare_clean_paths(self, paths=tuple()): @@ -461,11 +504,17 @@ def _populate(self): return if self.paths: - return self._create_resources_from_paths(root=root, paths=self.paths) + # In case of a list of full paths, we create resources without walking + return self._create_resources_from_full_paths(root=root, paths=self.paths) + # In case we have multiple else: - return self._create_resources_from_root(root=root) + return self._create_resources_from_root( + root=root, + includes=self.includes, + ignores=self.ignores, + ) - def _create_resources_from_paths(self, root, paths): + def _create_resources_from_full_paths(self, root, paths): # without paths we iterate the provided paths. We report an error # if a path is missing on disk. @@ -483,22 +532,21 @@ def _create_resources_from_paths(self, root, paths): msg = f"ERROR: cannot populate codebase: path: {path!r} not found in {res_loc!r}" self.errors.append(msg) raise Exception(path, join(base_location, path)) - continue # create all parents. 
The last parent is the one we want to use parent = root if TRACE: - logger_debug("Codebase._create_resources_from_paths: parent", parent) + logger_debug("Codebase._create_resources_from_full_paths: parent", parent) for parent_path in get_ancestor_paths(path, include_self=False): if TRACE: logger_debug( - f" Codebase._create_resources_from_paths: parent_path: {parent_path!r}" + f" Codebase._create_resources_from_full_paths: parent_path: {parent_path!r}" ) if not parent_path: continue newpar = parents_by_path.get(parent_path) if TRACE: - logger_debug(" Codebase._create_resources_from_paths: newpar", repr(newpar)) + logger_debug(" Codebase._create_resources_from_full_paths: newpar", repr(newpar)) if not newpar: newpar = self._get_or_create_resource( @@ -509,7 +557,7 @@ def _create_resources_from_paths(self, root, paths): ) if not newpar: raise Exception( - "ERROR: Codebase._create_resources_from_paths:" + "ERROR: Codebase._create_resources_from_full_paths:" f" cannot create parent for: {parent_path!r}" ) parent = newpar @@ -518,7 +566,7 @@ def _create_resources_from_paths(self, root, paths): if TRACE: logger_debug( - f" Codebase._create_resources_from_paths:", + f" Codebase._create_resources_from_full_paths:", f"created newpar: {newpar!r}", ) @@ -529,10 +577,10 @@ def _create_resources_from_paths(self, root, paths): is_file=isfile(res_loc), ) if TRACE: - logger_debug("Codebase._create_resources_from_paths: resource", res) + logger_debug("Codebase._create_resources_from_full_paths: resource", res) - def _create_resources_from_root(self, root): - # without paths we walks the root location top-down + def _create_resources_from_root(self, root, includes, ignores): + # without paths we walk the root location top-down # track resources parents by location during construction. 
# NOTE: this cannot exhaust memory on a large codebase, because we do @@ -545,9 +593,15 @@ def err(_error): f"ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}" ) + skip_ignored = partial(is_ignored, includes=includes, excludes=ignores) + + if TRACE_DEEP: + logger_debug(f"parents_by_loc: {parents_by_loc}, ignores: {ignores}, includes: {includes}") + # Walk over the directory and build the resource tree for top, dirs, files in depth_walk( root_location=root.location, + skip_ignored=skip_ignored, max_depth=self.max_depth, error_handler=err, ): @@ -557,6 +611,7 @@ def err(_error): top=top, dirs=dirs, files=files, + skip_ignored=skip_ignored, ): # on the plain, bare FS, files cannot be parents if not created.is_file: @@ -574,6 +629,8 @@ def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored) for name in names: location = join(top, name) if skip_ignored(location): + if TRACE_DEEP: + logger_debug(f"_create_resources, depth_walk loop: ignored location: {location}") continue res = self._get_or_create_resource( name=name, diff --git a/src/scancode/cli.py b/src/scancode/cli.py index 1376c6cfee9..ca4d279eb0e 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -221,6 +221,26 @@ def default_processes(): callback=validate_input_path, type=click.Path(exists=True, readable=True, path_type=str)) +@click.option('--include', + multiple=True, + default=None, + metavar='', + help='Include files matching .', + sort_order=11, + help_group=cliutils.CORE_GROUP, + cls=PluggableCommandLineOption, +) + +@click.option('--ignore', + multiple=True, + default=None, + metavar='', + help='Ignore files matching .', + sort_order=10, + help_group=cliutils.CORE_GROUP, + cls=PluggableCommandLineOption, +) + @click.option('--strip-root', is_flag=True, default=False, @@ -395,6 +415,8 @@ def default_processes(): def scancode( ctx, input, # NOQA + include, + ignore, strip_root, full_root, processes, @@ -505,6 +527,8 @@ def scancode( # run proper success, 
_results = run_scan( input=input, + include=include, + ignore=ignore, from_json=from_json, strip_root=strip_root, full_root=full_root, @@ -545,7 +569,9 @@ def scancode( def run_scan( - input, # NOQA + input, # + include=[], + ignore=[], from_json=False, strip_root=False, full_root=False, @@ -644,12 +670,10 @@ def echo_func(*_args, **_kwargs): # and we craft a list of synthetic --include path pattern options from # the input list of paths included_paths = [as_posixpath(path).rstrip('/') for path in input] - # FIXME: this is a hack as this "include" is from an external plugin!!! - include = list(requested_options.get('include', []) or []) include.extend(included_paths) - requested_options['include'] = include # ... and use the common prefix as our new input + # FIXME: we should not walk outside inputs input = common_prefix # NOQA # build mappings of all options to pass down to plugins @@ -894,6 +918,8 @@ def echo_func(*_args, **_kwargs): try: codebase = codebase_class( location=input, + includes=include, + ignores=ignore, resource_attributes=resource_attributes, codebase_attributes=codebase_attributes, full_root=full_root, diff --git a/src/scancode/plugin_ignore.py b/src/scancode/plugin_ignore.py index 70b0e30b10b..3b1b3a06ed0 100644 --- a/src/scancode/plugin_ignore.py +++ b/src/scancode/plugin_ignore.py @@ -37,87 +37,63 @@ def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) -@pre_scan_impl -class ProcessIgnore(PreScanPlugin): +def process_codebase(codebase, ignore=(), include=(), **kwargs): """ - Include or ignore files matching patterns. + WARNING: DEPRECATED, ignore/include moved to codebase import + step in core plugins. + Keep only included and non-ignored Resources in the codebase. 
""" - options = [ - PluggableCommandLineOption(('--ignore',), - multiple=True, - default=None, - metavar='', - help='Ignore files matching .', - sort_order=10, - help_group=PRE_SCAN_GROUP), - PluggableCommandLineOption(('--include',), - multiple=True, - default=None, - metavar='', - help='Include files matching .', - sort_order=11, - help_group=PRE_SCAN_GROUP) - ] - - def is_enabled(self, ignore, include, **kwargs): - return ignore or include - - def process_codebase(self, codebase, ignore=(), include=(), **kwargs): - """ - Keep only included and non-ignored Resources in the codebase. - """ - - if not (ignore or include): - return - - excludes = { - pattern: 'User ignore: Supplied by --ignore' for pattern in ignore - } - - includes = { - pattern: 'User include: Supplied by --include' for pattern in include - } - - included = partial(is_included, includes=includes, excludes=excludes) - - paths_to_remove = set() - paths_to_remove_add = paths_to_remove.add - paths_to_remove_discard = paths_to_remove.discard - - # Walk codebase top-down to collect the paths of Resources to remove. - for resource in codebase.walk(topdown=True): - if resource.is_root: - continue - - resource_path = resource.path - - if not included(resource_path): - for child in resource.children(codebase): - paths_to_remove_add(child.path) - paths_to_remove_add(resource_path) - else: - # we may have been selected for removal based on a parent dir - # but may be explicitly included. Honor that - paths_to_remove_discard(resource_path) - - if TRACE: - logger_debug('process_codebase: paths_to_remove') - logger_debug(paths_to_remove) - for path in sorted(paths_to_remove): - logger_debug(codebase.get_resource(path)) - - remove_resource = codebase.remove_resource - - # Then, walk bottom-up and remove the non-included Resources from the - # Codebase if the Resource path is in our list of paths to remove. 
- for resource in codebase.walk(topdown=False): - resource_path = resource.path - if resource.is_root: - continue - # removing dirs will also remove its files - if resource.is_dir: - continue - if resource_path in paths_to_remove: - paths_to_remove_discard(resource_path) - remove_resource(resource) + if not (ignore or include): + return + + excludes = { + pattern: 'User ignore: Supplied by --ignore' for pattern in ignore + } + + includes = { + pattern: 'User include: Supplied by --include' for pattern in include + } + + included = partial(is_included, includes=includes, excludes=excludes) + + paths_to_remove = set() + paths_to_remove_add = paths_to_remove.add + paths_to_remove_discard = paths_to_remove.discard + + # Walk codebase top-down to collect the paths of Resources to remove. + for resource in codebase.walk(topdown=True): + if resource.is_root: + continue + + resource_path = resource.path + + if not included(resource_path): + for child in resource.children(codebase): + paths_to_remove_add(child.path) + paths_to_remove_add(resource_path) + else: + # we may have been selected for removal based on a parent dir + # but may be explicitly included. Honor that + paths_to_remove_discard(resource_path) + + if TRACE: + logger_debug('process_codebase: paths_to_remove') + logger_debug(paths_to_remove) + for path in sorted(paths_to_remove): + logger_debug(codebase.get_resource(path)) + + remove_resource = codebase.remove_resource + + # Then, walk bottom-up and remove the non-included Resources from the + # Codebase if the Resource path is in our list of paths to remove. 
+ for resource in codebase.walk(topdown=False): + resource_path = resource.path + if resource.is_root: + continue + # removing dirs will also remove its files + if resource.is_dir: + continue + if resource_path in paths_to_remove: + paths_to_remove_discard(resource_path) + remove_resource(resource) diff --git a/tests/commoncode/test_fileset.py b/tests/commoncode/test_fileset.py index 2c6e5ef72aa..4d5f639b99d 100644 --- a/tests/commoncode/test_fileset.py +++ b/tests/commoncode/test_fileset.py @@ -55,6 +55,11 @@ def test_is_included_is_included_exclusions_2(self): assert fileset.is_included("/some/src/this/that", incs, excs) assert not fileset.is_included("/src/dist/build/mylib.so", incs, excs) + def test_is_included_is_included_inside_exclusions(self): + incs = {"/src/*.so": ".scanignore"} + excs = {"/src/*": ".scanignore"} + assert not fileset.is_included("/src/dist/build/mylib.so", incs, excs) + def test_is_included_empty_exclusions(self): incs = {"/src/*": ".scanignore"} excs = {"": ".scanignore"} diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 6794b19d602..855b7c7959d 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -94,8 +94,6 @@ Options: such that all paths have a common root directory. pre-scan: - --ignore Ignore files matching . - --include Include files matching . --facet = Add the to files with a path matching . @@ -140,11 +138,13 @@ Options: which are todo items and needs manual review. core: + --ignore Ignore files matching . --timeout Stop an unfinished file scan after a timeout in seconds. [default: 120 seconds] -n, --processes INT Set the number of parallel processes to use. Disable parallel processing if 0. Also disable threading if -1. [default: (number of CPUs)-1] + --include Include files matching . -q, --quiet Do not print summary or progress. -v, --verbose Print progress as file-by-file path instead of a progress bar. 
Print verbose scan counters. diff --git a/tests/scancode/test_plugin_ignore.py b/tests/scancode/test_plugin_ignore.py index 78f2954d76b..db739db88a7 100644 --- a/tests/scancode/test_plugin_ignore.py +++ b/tests/scancode/test_plugin_ignore.py @@ -14,7 +14,6 @@ from commoncode.fileset import is_included from scancode.cli_test_utils import run_scan_click from scancode.cli_test_utils import load_json_result -from scancode.plugin_ignore import ProcessIgnore from commoncode.resource import Codebase @@ -48,15 +47,13 @@ def test_is_included_glob_file(self): assert not is_included(location, excludes=excludes) def check_ProcessIgnore(self, test_dir, expected, ignore, include=()): - codebase = Codebase(test_dir) - test_plugin = ProcessIgnore() - test_plugin.process_codebase(codebase, ignore=ignore, include=include) + codebase = Codebase(location=test_dir, ignores=ignore, includes=include) resources = [res.strip_root_path for res in codebase.walk(skip_root=True)] assert sorted(resources) == expected def test_ProcessIgnore_with_single_file(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - ignore = ('sample.doc',) + ignore = ('*sample.doc',) expected = [ 'user', 'user/ignore.doc', @@ -69,7 +66,7 @@ def test_ProcessIgnore_with_single_file(self): def test_ProcessIgnore_with_multiple_files(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - ignore = ('ignore.doc', 'sample.doc',) + ignore = ('*ignore.doc', '*sample.doc',) expected = [ 'user', 'user/src', @@ -111,25 +108,10 @@ def test_ProcessIgnore_with_multiple_ignores(self): ] self.check_ProcessIgnore(test_dir, expected, ignore) - def test_ProcessIgnore_include_with_glob_for_extension(self): - test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - include = ('*.doc',) - expected = [ - 'user', - 'user/ignore.doc', - 'user/src', - 'user/src/ignore.doc', - 'user/src/test', - 'user/src/test/sample.doc', - ] - self.check_ProcessIgnore(test_dir, expected, ignore=(), include=include) - def 
test_ProcessIgnore_process_codebase_does_not_fail_to_access_an_ignored_resourced_cached_to_disk(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - codebase = Codebase(test_dir, max_in_memory=1) - test_plugin = ProcessIgnore() ignore = ['test'] - test_plugin.process_codebase(codebase, ignore=ignore) + Codebase(location=test_dir, max_in_memory=1, ignores=ignore) class TestScanPluginIgnoreFiles(FileDrivenTesting): @@ -241,7 +223,7 @@ def test_scancode_multiple_ignores(self): def test_scancode_codebase_attempt_to_access_an_ignored_resourced_cached_to_disk(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') result_file = self.get_temp_file('json') - args = ['--copyright', '--strip-root', '--ignore', 'test', test_dir, '--max-in-memory', '1', '--json', result_file] + args = ['--copyright', '--strip-root', '--ignore', '*test', test_dir, '--max-in-memory', '1', '--json', result_file] run_scan_click(args) scan_result = load_json_result(result_file) assert scan_result['headers'][0]['extra_data']['files_count'] == 2 @@ -251,6 +233,5 @@ def test_scancode_codebase_attempt_to_access_an_ignored_resourced_cached_to_disk u'user/ignore.doc', u'user/src', u'user/src/ignore.doc', - u'user/src/test', ] assert scan_locs == expected From 8e88b9426129d26fac2ec93106e502cb60a89f89 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Jun 2026 18:50:21 +0530 Subject: [PATCH 2/6] Do not codebase walk outside input paths For multiple inputs, do not walk outside the input paths, from their common prefix. Instead create only the directory relationships between the common prefix and input paths and start the codebase walk from every input path. Also deprecate --include options to only support ignoring paths through path patterns.
Signed-off-by: Ayan Sinha Mahapatra --- src/commoncode/resource.py | 82 ++++++++++++++++++------- src/commoncode/testcase.py | 7 ++- src/scancode/cli.py | 41 +++++-------- tests/commoncode/test_resource.py | 2 +- tests/scancode/data/help/help_linux.txt | 1 - tests/scancode/test_cli.py | 13 +++- 6 files changed, 95 insertions(+), 51 deletions(-) diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 19bfa955e36..16e17d4ca90 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -63,7 +63,7 @@ # Tracing flags TRACE = False -TRACE_DEEP = True +TRACE_DEEP = False def logger_debug(*args): @@ -110,7 +110,7 @@ def skip_ignored(location): return is_special(location) or ignored(location) -def is_ignored(location, includes=None, excludes=None): +def is_ignored(location, includes=tuple(), excludes=tuple()): excludes = { pattern: 'User ignore: Supplied by --ignore' for pattern in excludes @@ -339,8 +339,8 @@ def __init__( # finally populate self.paths = self._prepare_clean_paths(paths) + self.includes = self._prepare_clean_paths(includes) self.ignores = ignores - self.includes = includes self._populate() def _prepare_clean_paths(self, paths=tuple()): @@ -593,30 +593,48 @@ def err(_error): f"ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}" ) - skip_ignored = partial(is_ignored, includes=includes, excludes=ignores) + # ignore creating resources based on path patterns + skip_ignored = partial(is_ignored, excludes=ignores) if TRACE_DEEP: logger_debug(f"parents_by_loc: {parents_by_loc}, ignores: {ignores}, includes: {includes}") - # Walk over the directory and build the resource tree - for top, dirs, files in depth_walk( - root_location=root.location, - skip_ignored=skip_ignored, - max_depth=self.max_depth, - error_handler=err, - ): - parent = parents_by_loc.pop(top) - for created in self._create_resources( - parent=parent, - top=top, - dirs=dirs, - files=files, - skip_ignored=skip_ignored, + # in the case of a single 
input location, walking starts from + # the root and only the root location + if not includes: + includes = [root.location] + else: + # create the directory resources between the common + # prefix and the included locations so that they are + # connected to the root + for created in self._create_resources_common_prefix_to_inputs( + root=root, + includes=includes, ): - # on the plain, bare FS, files cannot be parents if not created.is_file: parents_by_loc[created.location] = created + # we start walking through all the input locations + for included_location in includes: + # Walk over the directory and build the resource tree + for top, dirs, files in depth_walk( + root_location=included_location, + skip_ignored=skip_ignored, + max_depth=self.max_depth, + error_handler=err, + ): + parent = parents_by_loc.pop(top) + for created in self._create_resources( + parent=parent, + top=top, + dirs=dirs, + files=files, + skip_ignored=skip_ignored, + ): + # on the plain, bare FS, files cannot be parents + if not created.is_file: + parents_by_loc[created.location] = created + def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored): """ Create and yield ``files`` and ``dirs`` children Resources of a @@ -641,6 +659,28 @@ def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored) logger_debug("Codebase.create_resources:", res) yield res + def _create_resources_common_prefix_to_inputs(self, root, includes): + + if TRACE_DEEP: + logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}") + + for included_path in includes: + _, _, extra_dir_path = included_path.rpartition(root.location) + extra_dirs = extra_dir_path.strip("/").split("/") + if TRACE_DEEP: + logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}") + + dir_resource = root + for dir_segment in extra_dirs: + dir_resource = self._get_or_create_resource( + name=dir_segment, + 
parent=dir_resource, + is_file=False, + ) + if TRACE: + logger_debug("Codebase.create_resources:", dir_resource) + yield dir_resource + def _create_root_resource(self): """ Create and return the root Resource of this codebase. @@ -1606,8 +1646,8 @@ def clean_path(path): Return a cleaned and normalized POSIX ``path``. """ path = path or "" - # convert to posix and ensure we have no slash at both ends - path = posixpath_normpath(path.replace("\\", "/").strip("/")) + # convert to posix and ensure we have no slash at the end + path = posixpath_normpath(path.replace("\\", "/").rstrip("/")) if path == ".": path = "" return path diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py index 2a7b37a9923..3857e2e5f46 100644 --- a/src/commoncode/testcase.py +++ b/src/commoncode/testcase.py @@ -92,7 +92,7 @@ class FileDrivenTesting(object): test_data_dir = None - def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True): + def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True, relative=False): """ Given a `test_path` relative to the self.test_data_dir directory, return the location to a test file or directory for this path. 
Copy to a temp @@ -128,6 +128,11 @@ def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True): # cleanup of VCS that could be left over from checkouts self.remove_vcs(target_dir) test_loc = target_dir + + if relative: + _, _, rel_test_loc = test_loc.rpartition(os.getcwd()) + return rel_test_loc.strip("/") + return test_loc def get_temp_file(self, extension=None, dir_name="td", file_name="tf"): diff --git a/src/scancode/cli.py b/src/scancode/cli.py index ca4d279eb0e..1e418e32f07 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -221,16 +221,6 @@ def default_processes(): callback=validate_input_path, type=click.Path(exists=True, readable=True, path_type=str)) -@click.option('--include', - multiple=True, - default=None, - metavar='', - help='Include files matching .', - sort_order=11, - help_group=cliutils.CORE_GROUP, - cls=PluggableCommandLineOption, -) - @click.option('--ignore', multiple=True, default=None, @@ -415,7 +405,6 @@ def default_processes(): def scancode( ctx, input, # NOQA - include, ignore, strip_root, full_root, @@ -527,7 +516,6 @@ def scancode( # run proper success, _results = run_scan( input=input, - include=include, ignore=ignore, from_json=from_json, strip_root=strip_root, @@ -570,7 +558,6 @@ def scancode( def run_scan( input, # - include=[], ignore=[], from_json=False, strip_root=False, @@ -623,6 +610,9 @@ def echo_func(*_args, **_kwargs): msg = 'At least one input path is required.' raise ScancodeError(msg) + # To support multiple path inputs + include = [] + if not isinstance(input, (list, tuple)): if not isinstance(input, str): msg = 'Unknown format: "{}".'.format(repr(input)) @@ -637,8 +627,6 @@ def echo_func(*_args, **_kwargs): # VirtualCodebase; otherwise we have to process `input` to make it a single # root with excludes. elif not from_json: - # FIXME: support the multiple root better. 
This is quirky at best - # This is the case where we have a list of input path and the # `from_json` option is not selected: we can handle this IFF they share # a common root directory and none is an absolute path @@ -650,30 +638,33 @@ def echo_func(*_args, **_kwargs): ) raise ScancodeError(msg) + abs_input = [os.path.abspath(i) for i in input] + # find the common prefix directory (note that this is a pre string # operation hence it may return non-existing paths - common_prefix = os.path.commonprefix(input) + common_prefix = os.path.commonprefix(abs_input) if not common_prefix: # we have no common prefix, but all relative. therefore the - # parent/root is the current ddirectory + # parent/root is the current directory common_prefix = str('.') + elif not common_prefix.endswith("/"): + # common prefix has trailing incomplete dirname + # for example the common prefix of "/temp/scancode" + # and "/temp/scans" is "/temp/scan" + common_prefix, _, _ = common_prefix.rpartition("/") elif not os.path.isdir(common_prefix): msg = ( 'Invalid inputs: all input paths must share a ' - 'common single parent directory.' + f'common single parent directory. common part: {common_prefix}' ) raise ScancodeError(msg) - # and we craft a list of synthetic --include path pattern options from - # the input list of paths - included_paths = [as_posixpath(path).rstrip('/') for path in input] - include.extend(included_paths) - - # ... 
and use the common prefix as our new input - # FIXME: we should not walk outside inputs + # and we craft a list of include paths where the codebase walks + # will start from, even though the root is the common prefix + include = [as_posixpath(path).rstrip('/') for path in abs_input] input = common_prefix # NOQA # build mappings of all options to pass down to plugins diff --git a/tests/commoncode/test_resource.py b/tests/commoncode/test_resource.py index 6249ebb435f..4b36bd92380 100644 --- a/tests/commoncode/test_resource.py +++ b/tests/commoncode/test_resource.py @@ -353,7 +353,7 @@ def test_get_resource_for_multiple_resource_codebase(self): codebase = Codebase(test_codebase) assert codebase.get_resource("resource/a").path == "resource/a" - assert codebase.get_resource("/resource/c").path == "resource/c" + assert codebase.get_resource("resource/c").path == "resource/c" assert codebase.get_resource("resource/dsasda/../b/").path == "resource/b" def test_Resource_build_path(self): diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 855b7c7959d..2b917909f37 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -144,7 +144,6 @@ Options: -n, --processes INT Set the number of parallel processes to use. Disable parallel processing if 0. Also disable threading if -1. [default: (number of CPUs)-1] - --include Include files matching . -q, --quiet Do not print summary or progress. -v, --verbose Print progress as file-by-file path instead of a progress bar. Print verbose scan counters. 
diff --git a/tests/scancode/test_cli.py b/tests/scancode/test_cli.py index 9d038f71e62..0dca907efcf 100644 --- a/tests/scancode/test_cli.py +++ b/tests/scancode/test_cli.py @@ -168,7 +168,7 @@ def test_scan_info_returns_full_root(): file_paths = [f['path'] for f in result_data['files']] assert len(file_paths) == 12 # note that we strip paths from leading and trailing slashes - root = fileutils.as_posixpath(test_dir).strip('/') + root = fileutils.as_posixpath(test_dir) assert all(p.startswith(root) for p in file_paths) @@ -184,7 +184,7 @@ def test_scan_info_returns_correct_full_root_with_single_file(): scanned_file = files[0] # and we check that the path is the full path without repeating the file name # note that the path never contain leading and trailing slashes - assert scanned_file['path'] == fileutils.as_posixpath(test_file).strip('/') + assert scanned_file['path'] == fileutils.as_posixpath(test_file) def test_scan_info_returns_does_not_strip_root_with_single_file(): @@ -837,6 +837,15 @@ def test_scan_should_not_fail_with_low_max_in_memory_setting_when_ignoring_files run_scan_click(args, expected_rc=0) +def test_scan_supports_multiple_input_paths(): + test_file_1 = test_env.get_test_loc('summaries/client', relative=True) + test_file_2 = test_env.get_test_loc('summaries/counts', relative=True) + result_file = test_env.get_temp_file('json') + args = ['--info', '-n', '1', test_file_1, test_file_2, '--json', result_file] + run_scan_click(args, expected_rc=0) + + + def test_get_displayable_summary(): from scancode.cli import get_displayable_summary from commoncode.resource import Codebase From 43c5acfce6a73ee2864b5d5c023355f5a1fd8eef Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:18:40 +0530 Subject: [PATCH 3/6] Bump commoncode version for release Signed-off-by: Ayan Sinha Mahapatra --- pyproject-commoncode.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml 
index 6c69ab439b3..10ab2d8631c 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "commoncode" -version = "32.4.2" +version = "32.5.0" authors = [ { name = "nexB. Inc. and others", email = "info@aboutcode.org" }, ] From 0445da5973b00dc7b05f4150b98d61f342028430 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:37:51 +0530 Subject: [PATCH 4/6] Fix commoncode release scripts and bump version Signed-off-by: Ayan Sinha Mahapatra --- .github/workflows/commoncode-release.yml | 2 +- .../licensedcode-data-index-release.yml | 2 +- commoncode-CHANGELOG.rst | 19 +++++++++++++++++++ pyproject-commoncode.toml | 5 ++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/commoncode-release.yml b/.github/workflows/commoncode-release.yml index 467e6ac234b..ca127d29656 100644 --- a/.github/workflows/commoncode-release.yml +++ b/.github/workflows/commoncode-release.yml @@ -1,4 +1,4 @@ -name: Create library release archives, create a GH release and publish PyPI wheel and sdist on tag in main branch +name: Create and release commoncode wheels on GitHub and Pypi # This is executed automatically on a tag in the main branch diff --git a/.github/workflows/licensedcode-data-index-release.yml b/.github/workflows/licensedcode-data-index-release.yml index ba267f89f5f..353829d1905 100644 --- a/.github/workflows/licensedcode-data-index-release.yml +++ b/.github/workflows/licensedcode-data-index-release.yml @@ -1,4 +1,4 @@ -name: Create library release archives, create a GH release and publish PyPI wheel and sdist on tag in main branch +name: Create and release licensedcode index & data wheels on GitHub and Pypi # This is executed automatically on a tag in the main branch diff --git a/commoncode-CHANGELOG.rst b/commoncode-CHANGELOG.rst index dc63866360b..7e3a2fb344f 100644 --- a/commoncode-CHANGELOG.rst +++ b/commoncode-CHANGELOG.rst @@ -1,6 +1,25 @@ 
Release notes ============= + +Version 32.5.1 - (2026-06-11) +----------------------------- + +- Minor fix in pyproject.toml to release wheels + to pypi properly. + +Version 32.5.0 - (2026-06-11) +----------------------------- + +- Merge commoncode back into scancode-toolkit + https://github.com/aboutcode-org/scancode-toolkit/pull/5116 + +- Add support to create codebase from multiple input paths by + starting codebase walk from these inputs and then ignoring + based on path patterns. Improves codebase and resource + collection and creation performance for multi-path scan inputs + https://github.com/aboutcode-org/scancode-toolkit/pull/5055 + Version 32.4.2 - (2025-01-08) ----------------------------- diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml index 10ab2d8631c..92adbe61f1f 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -42,9 +42,6 @@ metadata_files = [ requires-python = ">=3.10" -[project.urls] -Homepage = "https://github.com/nexB/scancode-toolkit" - dependencies = [ "attrs >= 18.1,!=20.1.0;python_version<'3.11'", "attrs >= 22.1.0;python_version>='3.11'", @@ -55,6 +52,8 @@ dependencies = [ "text_unidecode >= 1.0" ] +[project.urls] +Homepage = "https://github.com/nexB/scancode-toolkit" [project.optional-dependencies] dev = [ From 72dd3d8d1a6e6b4dc17f04b2d589e36a95ea1cdf Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:55:19 +0530 Subject: [PATCH 5/6] Bump version for commoncode v32.5.2 Signed-off-by: Ayan Sinha Mahapatra --- commoncode-CHANGELOG.rst | 4 ++++ pyproject-commoncode.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/commoncode-CHANGELOG.rst b/commoncode-CHANGELOG.rst index 7e3a2fb344f..2d56e748149 100644 --- a/commoncode-CHANGELOG.rst +++ b/commoncode-CHANGELOG.rst @@ -1,6 +1,10 @@ Release notes ============= +Version 32.5.2 - (2026-06-11) +----------------------------- + +- Bump version properly. 
Version 32.5.1 - (2026-06-11) ----------------------------- diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml index 92adbe61f1f..3ea4920d29c 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "commoncode" -version = "32.5.0" +version = "32.5.2" authors = [ { name = "nexB. Inc. and others", email = "info@aboutcode.org" }, ] From bb148eadad2ea9eb0c60a4ef8759675365b8c06f Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 17:08:56 +0530 Subject: [PATCH 6/6] Fix test failures Signed-off-by: Ayan Sinha Mahapatra --- src/commoncode/resource.py | 4 ++-- src/commoncode/testcase.py | 2 +- src/scancode/outdated.py | 12 ++++++++++-- src/scancode_config.py | 2 +- tests/scancode/data/help/help.txt | 3 +-- tests/scancode/test_outdated.py | 4 ++-- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 378388a2dd3..5302f2f9d72 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -1667,8 +1667,8 @@ def strip_first_path_segment(path): '' >>> strip_first_path_segment('foo/bar/baz') 'bar/baz' - >>> strip_first_path_segment('/foo/bar/baz/') - 'bar/baz' + >>> strip_first_path_segment('/foo/bar/baz') + 'foo/bar/baz' >>> strip_first_path_segment('foo/') '' """ diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py index 4feb7c645b1..ee680e24da2 100644 --- a/src/commoncode/testcase.py +++ b/src/commoncode/testcase.py @@ -132,7 +132,7 @@ def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True, rela if relative: _, _, rel_test_loc = test_loc.rpartition(os.getcwd()) - return rel_test_loc.strip("/") + return rel_test_loc.strip("/").strip("\\") return test_loc diff --git a/src/scancode/outdated.py b/src/scancode/outdated.py index 4be850d8470..2c68dc39e2e 100644 --- a/src/scancode/outdated.py +++ b/src/scancode/outdated.py @@ -83,7 +83,11 @@ def 
total_seconds(td): class VersionCheckState: - def __init__(self): + def __init__(self, is_test=False): + if is_test: + self.state={} + return + self.statefile_path = os.path.join( scancode_cache_dir, 'scancode-version-check.json') self.lockfile_path = self.statefile_path + '.lockfile' @@ -135,6 +139,7 @@ def check_scancode_version( release_date=scancode_release_date, new_version_url='https://pypi.org/pypi/scancode-toolkit/json', force=False, + is_test=False, ): """ Check for an updated version of scancode-toolkit. Return a message to @@ -146,6 +151,7 @@ def check_scancode_version( installed_version=installed_version, new_version_url=new_version_url, force=force, + is_test=is_test, ) if newer_version: return build_outdated_message( @@ -159,6 +165,7 @@ def fetch_newer_version( installed_version=scancode_version, new_version_url='https://pypi.org/pypi/scancode-toolkit/json', force=False, + is_test=False, ): """ Return a version string if there is an updated version of scancode-toolkit @@ -175,9 +182,10 @@ def fetch_newer_version( try: installed_version = packaging_version.parse(installed_version) - state = VersionCheckState() + state = VersionCheckState(is_test=is_test) current_time = datetime.datetime.utcnow() + latest_version = None # Determine if we need to refresh the state if ('last_check' in state.state and 'latest_version' in state.state): last_check = datetime.datetime.strptime( diff --git a/src/scancode_config.py b/src/scancode_config.py index 20c57a19bef..6e9f634b083 100644 --- a/src/scancode_config.py +++ b/src/scancode_config.py @@ -95,7 +95,7 @@ def _create_dir(location): from subprocess import CalledProcessError # this may fail with exceptions - cmd = 'git', 'describe', '--tags', + cmd = 'git', 'describe', '--tags', '--match="v*"' try: output = check_output(cmd, stderr=STDOUT) __version__ = output.decode('utf-8').strip() diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index e725888ead4..d65f1f00f45 100644 --- 
a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -92,8 +92,6 @@ Options: such that all paths have a common root directory. pre-scan: - --ignore Ignore files matching . - --include Include files matching . --facet = Add the to files with a path matching . @@ -138,6 +136,7 @@ Options: which are todo items and needs manual review. core: + --ignore Ignore files matching . --timeout Stop an unfinished file scan after a timeout in seconds. [default: 120 seconds] -n, --processes INT Set the number of parallel processes to use. Disable diff --git a/tests/scancode/test_outdated.py b/tests/scancode/test_outdated.py index cdac7853b2b..0509c6ea360 100644 --- a/tests/scancode/test_outdated.py +++ b/tests/scancode/test_outdated.py @@ -152,8 +152,8 @@ def jget(*args, **kwargs): json=jget, status_code=200 ) - assert not outdated.fetch_newer_version(force=True) - assert not outdated.check_scancode_version(force=True) + assert not outdated.fetch_newer_version(force=True, is_test=True) + assert not outdated.check_scancode_version(force=True, is_test=True) def test_fetch_newer_version_local_git_version():