Merge pull request #19424 from github/tausbn/python-extract-hidden-file-by-default

Python: Extract files in hidden dirs by default
This commit is contained in:
Taus
2025-05-16 14:43:47 +02:00
committed by GitHub
15 changed files with 69 additions and 37 deletions

View File

@@ -0,0 +1,3 @@
name: Test Config
paths-ignore:
- "**/.*/**"

View File

@@ -0,0 +1,6 @@
| name |
+-------------------------------+
| .hidden_file.py |
| another_non_hidden.py |
| foo.py |
| visible_file_in_hidden_dir.py |

View File

@@ -0,0 +1,4 @@
| name |
+-----------------+
| .hidden_file.py |
| foo.py |

View File

@@ -0,0 +1,3 @@
// Selects the short name of every `File`, ordered by name.
import python
select any(File f).getShortName() as name order by name

View File

@@ -0,0 +1 @@
print(42)

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Integration test for extraction of hidden files and directories by the Python extractor.
# Builds two databases from repo_dir/ (once with default settings, once with config.yml),
# runs query.ql against each, and diffs the output against the matching .expected file.
# Any failing command (including a non-empty diff) aborts the script because of `set -e`.

set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -x

# Use the executable named by the CODEQL environment variable, falling back to `codeql`.
CODEQL=${CODEQL:-codeql}

# Resolve the directory containing this script and run from there, so that the relative
# paths used below (repo_dir/, query.ql, config.yml, *.expected) resolve.
SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$SCRIPTDIR"

# Remove any databases left behind by an earlier run.
rm -rf db db-skipped

# Test 1: Default behavior should be to extract files in hidden directories
$CODEQL database create db --language python --source-root repo_dir/
$CODEQL query run --database db query.ql > query-default.actual
diff query-default.expected query-default.actual

# Test 2: The default behavior can be overridden by setting `paths-ignore` in the config file
$CODEQL database create db-skipped --language python --source-root repo_dir/ --codescanning-config=config.yml
$CODEQL query run --database db-skipped query.ql > query-skipped.actual
diff query-skipped.expected query-skipped.actual

# Remove the databases created above; only reached when both diffs succeeded.
rm -rf db db-skipped

View File

@@ -41,6 +41,9 @@ def glob_part_to_regex(glob, add_sep):
def glob_to_regex(glob, prefix=""):
'''Convert entire glob to a compiled regex'''
# When the glob ends in `/`, we need to remember this so that we don't accidentally add an
# extra separator to the final regex.
end_sep = "" if glob.endswith("/") else SEP
glob = glob.strip().strip("/")
parts = glob.split("/")
#Trailing '**' is redundant, so strip it off.
@@ -48,12 +51,17 @@ def glob_to_regex(glob, prefix=""):
parts = parts[:-1]
if not parts:
return ".*"
# The `glob.strip("/")` call above will have removed all trailing slashes, but if there was at
# least one trailing slash, we want there to be an extra part, so we add it explicitly here in
# that case, using the emptiness of `end_sep` as a proxy.
if end_sep == "":
parts += [""]
parts = [ glob_part_to_regex(escape(p), True) for p in parts[:-1] ] + [ glob_part_to_regex(escape(parts[-1]), False) ]
# we need to escape the prefix, specifically because on windows the prefix will be
# something like `C:\\folder\\subfolder\\` and without escaping the
# backslash-path-separators will get interpreted as regex escapes (which might be
# invalid sequences, causing the extractor to crash)
full_pattern = escape(prefix) + ''.join(parts) + "(?:" + SEP + ".*|$)"
full_pattern = escape(prefix) + ''.join(parts) + "(?:" + end_sep + ".*|$)"
return re.compile(full_pattern)
def filter_from_pattern(pattern, prev_filter, prefix):

View File

@@ -83,46 +83,21 @@ class Traverser(object):
self.logger.debug("Ignoring %s (symlink)", fullpath)
continue
if isdir(fullpath):
if fullpath in self.exclude_paths or is_hidden(fullpath):
if is_hidden(fullpath):
self.logger.debug("Ignoring %s (hidden)", fullpath)
else:
self.logger.debug("Ignoring %s (excluded)", fullpath)
else:
empty = True
for item in self._treewalk(fullpath):
yield item
empty = False
if not empty:
yield fullpath
if fullpath in self.exclude_paths:
self.logger.debug("Ignoring %s (excluded)", fullpath)
continue
empty = True
for item in self._treewalk(fullpath):
yield item
empty = False
if not empty:
yield fullpath
elif self.filter(fullpath):
yield fullpath
else:
self.logger.debug("Ignoring %s (filter)", fullpath)
if os.name != 'nt':
    def is_hidden(path):
        '''Return True if the final component of `path` starts with a dot.'''
        leaf = os.path.basename(path)
        return leaf.startswith(".")
else:
    import ctypes

    def is_hidden(path):
        '''Return True if `path` is hidden.

        A path counts as hidden when bit 2 of the value returned by
        `GetFileAttributesW` is set, or when its final component starts with a
        dot. If the attribute lookup returns -1, the path is reported as not
        hidden. If the lookup raises, only the dot-prefix check is used.
        '''
        hidden_bit = 2
        try:
            attributes = ctypes.windll.kernel32.GetFileAttributesW(str(path))
            if attributes == -1:
                return False
            flagged = bool(attributes & hidden_bit)
        except Exception:
            # Best effort: nothing useful to log here, so fall back to the name check.
            flagged = False
        return flagged or os.path.basename(path).startswith(".")
def exclude_filter_from_options(options):
if options.exclude_package:
choices = '|'.join(mod.replace('.', r'\.') for mod in options.exclude_package)

View File

@@ -10,7 +10,7 @@ from io import BytesIO
#Semantic version of extractor.
#Update this if any changes are made
VERSION = "7.1.2"
VERSION = "7.1.3"
PY_EXTENSIONS = ".py", ".pyw"

View File

@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
- The Python extractor now extracts files in hidden directories by default. If you would like to skip files in hidden directories, add `paths-ignore: ["**/.*/**"]` to your [Code Scanning config](https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning#specifying-directories-to-scan). If you would like to skip all hidden files, you can use `paths-ignore: ["**/.*"]`. When using the CodeQL CLI for extraction, specify the configuration (creating the configuration file if necessary) using the `--codescanning-config` option.

View File

@@ -1,3 +1,5 @@
| .hidden/inner/test.py |
| .hidden/module.py |
| folder/module.py |
| package |
| package/__init__.py |

View File

@@ -3,3 +3,4 @@
| Module foo.bar |
| Module foo.include_test |
| Package foo |
| Script hidden_foo.py |