Merge pull request #19424 from github/tausbn/python-extract-hidden-file-by-default

Python: Extract files in hidden dirs by default
2026-03-06 07:36:47 +01:00 · 2025-05-16 14:43:47 +02:00
parent cadcb202e2 9ee3e4cdf3
commit 579cf4a65a
15 changed files with 69 additions and 37 deletions
--- a/python/extractor/cli-integration-test/hidden-files/config.yml
+++ b/python/extractor/cli-integration-test/hidden-files/config.yml
@@ -0,0 +1,3 @@
+name: Test Config
+paths-ignore:
+  - "**/.*/**"
--- a/python/extractor/cli-integration-test/hidden-files/query-default.expected
+++ b/python/extractor/cli-integration-test/hidden-files/query-default.expected
@@ -0,0 +1,6 @@
+|             name              |
+-------------------------------+
+| .hidden_file.py               |
+| another_non_hidden.py         |
+| foo.py                        |
+| visible_file_in_hidden_dir.py |
--- a/python/extractor/cli-integration-test/hidden-files/query-skipped.expected
+++ b/python/extractor/cli-integration-test/hidden-files/query-skipped.expected
@@ -0,0 +1,4 @@
+|      name       |
+-----------------+
+| .hidden_file.py |
+| foo.py          |
--- a/python/extractor/cli-integration-test/hidden-files/query.ql
+++ b/python/extractor/cli-integration-test/hidden-files/query.ql
@@ -0,0 +1,3 @@
+import python
+
+select any(File f).getShortName() as name order by name
--- a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py
+++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py
--- a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py
+++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py
--- a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py
+++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py
--- a/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py
@@ -0,0 +1 @@
+print(42)
--- a/python/extractor/cli-integration-test/hidden-files/test.sh
+++ b/python/extractor/cli-integration-test/hidden-files/test.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db db-skipped
+
+# Test 1: Default behavior should be to extract files in hidden directories
+$CODEQL database create db --language python --source-root repo_dir/
+$CODEQL query run --database db query.ql > query-default.actual
+diff query-default.expected query-default.actual
+
+# Test 2: The default behavior can be overridden by setting `paths-ignore` in the config file
+$CODEQL database create db-skipped --language python --source-root repo_dir/ --codescanning-config=config.yml
+$CODEQL query run --database db-skipped query.ql > query-skipped.actual
+diff query-skipped.expected query-skipped.actual
+
+rm -rf db db-skipped
--- a/python/extractor/semmle/path_filters.py
+++ b/python/extractor/semmle/path_filters.py
@@ -41,6 +41,9 @@ def glob_part_to_regex(glob, add_sep):

 def glob_to_regex(glob, prefix=""):
    '''Convert entire glob to a compiled regex'''
+    # When the glob ends in `/`, we need to remember this so that we don't accidentally add an
+    # extra separator to the final regex.
+    end_sep = "" if glob.endswith("/") else SEP
    glob = glob.strip().strip("/")
    parts = glob.split("/")
    #Trailing '**' is redundant, so strip it off.
@@ -48,12 +51,17 @@ def glob_to_regex(glob, prefix=""):
        parts = parts[:-1]
        if not parts:
            return ".*"
+    # The `glob.strip("/")` call above will have removed all trailing slashes, but if there was at
+    # least one trailing slash, we want there to be an extra part, so we add it explicitly here in
+    # that case, using the emptiness of `end_sep` as a proxy.
+    if end_sep == "":
+        parts += [""]
    parts = [ glob_part_to_regex(escape(p), True) for p in parts[:-1] ] + [ glob_part_to_regex(escape(parts[-1]), False) ]
    # we need to escape the prefix, specifically because on windows the prefix will be
    # something like `C:\\folder\\subfolder\\` and without escaping the
    # backslash-path-separators will get interpreted as regex escapes (which might be
    # invalid sequences, causing the extractor to crash)
-    full_pattern = escape(prefix) + ''.join(parts) + "(?:" + SEP + ".*|$)"
+    full_pattern = escape(prefix) + ''.join(parts) + "(?:" + end_sep + ".*|$)"
    return re.compile(full_pattern)

 def filter_from_pattern(pattern, prev_filter, prefix):
--- a/python/extractor/semmle/traverser.py
+++ b/python/extractor/semmle/traverser.py
@@ -83,46 +83,21 @@ class Traverser(object):
                self.logger.debug("Ignoring %s (symlink)", fullpath)
                continue
            if isdir(fullpath):
-                if fullpath in self.exclude_paths or is_hidden(fullpath):
-                    if is_hidden(fullpath):
-                        self.logger.debug("Ignoring %s (hidden)", fullpath)
-                    else:
-                        self.logger.debug("Ignoring %s (excluded)", fullpath)
-                else:
-                    empty = True
-                    for item in self._treewalk(fullpath):
-                        yield item
-                        empty = False
-                    if not empty:
-                        yield fullpath
+                if fullpath in self.exclude_paths:
+                    self.logger.debug("Ignoring %s (excluded)", fullpath)
+                    continue
+
+                empty = True
+                for item in self._treewalk(fullpath):
+                    yield item
+                    empty = False
+                if not empty:
+                    yield fullpath
            elif self.filter(fullpath):
                yield fullpath
            else:
                self.logger.debug("Ignoring %s (filter)", fullpath)

-
-if os.name== 'nt':
-    import ctypes
-
-    def is_hidden(path):
-        #Magical windows code
-        try:
-            attrs = ctypes.windll.kernel32.GetFileAttributesW(str(path))
-            if attrs == -1:
-                return False
-            if attrs&2:
-                return True
-        except Exception:
-            #Not sure what to log here, probably best to carry on.
-            pass
-        return os.path.basename(path).startswith(".")
-
-else:
-
-    def is_hidden(path):
-        return os.path.basename(path).startswith(".")
-
-
 def exclude_filter_from_options(options):
    if options.exclude_package:
        choices = '|'.join(mod.replace('.', r'\.') for mod in options.exclude_package)
--- a/python/extractor/semmle/util.py
+++ b/python/extractor/semmle/util.py
@@ -10,7 +10,7 @@ from io import BytesIO

 #Semantic version of extractor.
 #Update this if any changes are made
-VERSION = "7.1.2"
+VERSION = "7.1.3"

 PY_EXTENSIONS = ".py", ".pyw"

--- a/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md
+++ b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md
@@ -0,0 +1,5 @@
+---
+category: minorAnalysis
+---
+
+- The Python extractor now extracts files in hidden directories by default. If you would like to skip files in hidden directories, add `paths-ignore: ["**/.*/**"]` to your [Code Scanning config](https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning#specifying-directories-to-scan). If you would like to skip all hidden files, you can use `paths-ignore: ["**/.*"]`. When using the CodeQL CLI for extraction, specify the configuration (creating the configuration file if necessary) using the `--codescanning-config` option.
--- a/python/ql/test/2/extractor-tests/hidden/test.expected
+++ b/python/ql/test/2/extractor-tests/hidden/test.expected
@@ -1,3 +1,5 @@
+| .hidden/inner/test.py |
+| .hidden/module.py |
 | folder/module.py |
 | package |
 | package/__init__.py |
--- a/python/ql/test/extractor-tests/filter-option/Test.expected
+++ b/python/ql/test/extractor-tests/filter-option/Test.expected
@@ -3,3 +3,4 @@
 | Module foo.bar |
 | Module foo.include_test |
 | Package foo |
+| Script hidden_foo.py |