Python: Copy Python extractor to codeql repo

2025-12-16 16:53:25 +01:00 · 2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions
--- a/python/extractor/cli-integration-test/.gitignore
+++ b/python/extractor/cli-integration-test/.gitignore
@@ -0,0 +1,5 @@
+*/db/
+*/dbs/
+*/venv/
+**/*.egg-info/
+*/.cache
--- a/python/extractor/cli-integration-test/README.md
+++ b/python/extractor/cli-integration-test/README.md
@@ -0,0 +1,21 @@
+# Extractor Python CodeQL CLI integration tests
+
+These tests ensure that the Python extractor and the CodeQL CLI work together as intended, and provide an easy way to set up realistic test cases.
+
+
+### Adding a new test case
+
+Add a new folder and place a file called `test.sh` in it; the file should start with the code below. The script must exit with a non-zero exit code to fail the test.
+
+```bash
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+```
--- a/python/extractor/cli-integration-test/basic/query.ql
+++ b/python/extractor/cli-integration-test/basic/query.ql
@@ -0,0 +1 @@
+select 1
--- a/python/extractor/cli-integration-test/basic/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/basic/repo_dir/foo.py
@@ -0,0 +1 @@
+print(42)
--- a/python/extractor/cli-integration-test/basic/test.sh
+++ b/python/extractor/cli-integration-test/basic/test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+
+$CODEQL database create db --language python --source-root repo_dir/
+$CODEQL query run --database db query.ql
--- a/python/extractor/cli-integration-test/disable-library-extraction/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/disable-library-extraction/repo_dir/foo.py
@@ -0,0 +1,3 @@
+import pip
+
+print(42)
--- a/python/extractor/cli-integration-test/disable-library-extraction/test.sh
+++ b/python/extractor/cli-integration-test/disable-library-extraction/test.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+# start on clean slate
+rm -rf dbs
+mkdir dbs
+
+cd "$SCRIPTDIR"
+
+# In 2.16.0 we will not extract libraries by default, so there is no difference in what
+# is extracted by setting this environment variable. We should remove this test when
+# 2.17.0 is released.
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_LIBRARY_EXTRACTION=
+$CODEQL database create dbs/normal --language python --source-root repo_dir/
+
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_LIBRARY_EXTRACTION=1
+$CODEQL database create dbs/no-lib-extraction --language python --source-root repo_dir/
+
+# ---
+
+set +x
+
+EXTRACTED_NORMAL=$(unzip -l dbs/normal/src.zip | wc -l)
+EXTRACTED_NO_LIB_EXTRACTION=$(unzip -l dbs/no-lib-extraction/src.zip | wc -l)
+
+exitcode=0
+
+echo "EXTRACTED_NORMAL=$EXTRACTED_NORMAL"
+echo "EXTRACTED_NO_LIB_EXTRACTION=$EXTRACTED_NO_LIB_EXTRACTION"
+
+if [[ $EXTRACTED_NO_LIB_EXTRACTION -lt $EXTRACTED_NORMAL ]]; then
+    echo "ERROR: EXTRACTED_NO_LIB_EXTRACTION smaller than EXTRACTED_NORMAL"
+    exitcode=1
+fi
+
+exit $exitcode
--- a/python/extractor/cli-integration-test/extract-stdlib/query.ql
+++ b/python/extractor/cli-integration-test/extract-stdlib/query.ql
@@ -0,0 +1,18 @@
+import python
+import semmle.python.types.Builtins
+
+predicate named_entity(string name, string kind) {
+  exists(Builtin::special(name)) and kind = "special"
+  or
+  exists(Builtin::builtin(name)) and kind = "builtin"
+  or
+  exists(Module m | m.getName() = name) and kind = "module"
+  or
+  exists(File f | f.getShortName() = name + ".py") and kind = "file"
+}
+
+from string name, string kind
+where
+  name in ["foo", "baz", "main", "os", "sys", "re"] and
+  named_entity(name, kind)
+select name, kind order by name, kind
--- a/python/extractor/cli-integration-test/extract-stdlib/query.with-stdlib.expected
+++ b/python/extractor/cli-integration-test/extract-stdlib/query.with-stdlib.expected
@@ -0,0 +1,12 @@
+| name |  kind   |
+------+---------+
+| baz  | file    |
+| baz  | module  |
+| foo  | file    |
+| foo  | module  |
+| main | file    |
+| os   | file    |
+| os   | module  |
+| re   | file    |
+| re   | module  |
+| sys  | special |
--- a/python/extractor/cli-integration-test/extract-stdlib/query.without-stdlib.expected
+++ b/python/extractor/cli-integration-test/extract-stdlib/query.without-stdlib.expected
@@ -0,0 +1,8 @@
+| name |  kind   |
+------+---------+
+| baz  | file    |
+| baz  | module  |
+| foo  | file    |
+| foo  | module  |
+| main | file    |
+| sys  | special |
--- a/python/extractor/cli-integration-test/extract-stdlib/repo_dir/baz.py
+++ b/python/extractor/cli-integration-test/extract-stdlib/repo_dir/baz.py
@@ -0,0 +1 @@
+quux = 4
--- a/python/extractor/cli-integration-test/extract-stdlib/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/extract-stdlib/repo_dir/foo.py
@@ -0,0 +1,4 @@
+import baz
+import re
+bar = 5 + baz.quux
+re.compile("hello")
--- a/python/extractor/cli-integration-test/extract-stdlib/repo_dir/main.py
+++ b/python/extractor/cli-integration-test/extract-stdlib/repo_dir/main.py
@@ -0,0 +1,6 @@
+import sys
+import os
+print(os.path)
+print(sys.path)
+import foo
+print(foo.bar)
--- a/python/extractor/cli-integration-test/extract-stdlib/test.sh
+++ b/python/extractor/cli-integration-test/extract-stdlib/test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf dbs
+
+mkdir dbs
+
+CODEQL_EXTRACTOR_PYTHON_DONT_EXTRACT_STDLIB=True $CODEQL database create dbs/without-stdlib --language python --source-root repo_dir/
+$CODEQL query run --database dbs/without-stdlib query.ql > query.without-stdlib.actual
+diff query.without-stdlib.expected query.without-stdlib.actual
+
+LGTM_INDEX_EXCLUDE="/usr/lib/**" $CODEQL database create dbs/with-stdlib --language python --source-root repo_dir/
+$CODEQL query run --database dbs/with-stdlib query.ql > query.with-stdlib.actual
+diff query.with-stdlib.expected query.with-stdlib.actual
--- a/python/extractor/cli-integration-test/force-enable-library-extraction/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/force-enable-library-extraction/repo_dir/foo.py
@@ -0,0 +1,3 @@
+import pip
+
+print(42)
--- a/python/extractor/cli-integration-test/force-enable-library-extraction/test.sh
+++ b/python/extractor/cli-integration-test/force-enable-library-extraction/test.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+# start on clean slate
+rm -rf dbs
+mkdir dbs
+
+cd "$SCRIPTDIR"
+
+export CODEQL_EXTRACTOR_PYTHON_FORCE_ENABLE_LIBRARY_EXTRACTION_UNTIL_2_17_0=
+$CODEQL database create dbs/normal --language python --source-root repo_dir/
+
+export CODEQL_EXTRACTOR_PYTHON_FORCE_ENABLE_LIBRARY_EXTRACTION_UNTIL_2_17_0=1
+$CODEQL database create dbs/with-lib-extraction --language python --source-root repo_dir/
+
+# ---
+
+set +x
+
+EXTRACTED_NORMAL=$(unzip -l dbs/normal/src.zip | wc -l)
+EXTRACTED_WITH_LIB_EXTRACTION=$(unzip -l dbs/with-lib-extraction/src.zip | wc -l)
+
+exitcode=0
+
+echo "EXTRACTED_NORMAL=$EXTRACTED_NORMAL"
+echo "EXTRACTED_WITH_LIB_EXTRACTION=$EXTRACTED_WITH_LIB_EXTRACTION"
+
+if [[ ! $EXTRACTED_WITH_LIB_EXTRACTION -gt $EXTRACTED_NORMAL ]]; then
+    echo "ERROR: EXTRACTED_WITH_LIB_EXTRACTION not greater than EXTRACTED_NORMAL"
+    exitcode=1
+fi
+
+exit $exitcode
--- a/python/extractor/cli-integration-test/ignore-venv/.gitignore
+++ b/python/extractor/cli-integration-test/ignore-venv/.gitignore
@@ -0,0 +1,2 @@
+venv/
+venv2/
--- a/python/extractor/cli-integration-test/ignore-venv/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/ignore-venv/repo_dir/foo.py
@@ -0,0 +1,3 @@
+import flask
+
+print(42)
--- a/python/extractor/cli-integration-test/ignore-venv/test.sh
+++ b/python/extractor/cli-integration-test/ignore-venv/test.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+# start on clean slate
+rm -rf dbs repo_dir/venv*
+mkdir dbs
+
+
+# set up venvs
+cd repo_dir
+
+python3 -m venv venv
+venv/bin/pip install flask
+
+python3 -m venv venv2
+
+cd "$SCRIPTDIR"
+
+# In 2.16.0 we stop extracting libraries by default, so to test this functionality we
+# need to force enable it. Once we release 2.17.0 and turn off library extraction for
+# good, we can remove the part of this test ensuring that dependencies in an active
+# venv are still extracted (since that will no longer be the case).
+export CODEQL_EXTRACTOR_PYTHON_FORCE_ENABLE_LIBRARY_EXTRACTION_UNTIL_2_17_0=1
+
+# Create DBs with venv2 active (that does not have flask installed)
+source repo_dir/venv2/bin/activate
+
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_AUTOMATIC_VENV_EXCLUDE=
+$CODEQL database create dbs/normal --language python --source-root repo_dir/
+
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_AUTOMATIC_VENV_EXCLUDE=1
+$CODEQL database create dbs/no-venv-ignore --language python --source-root repo_dir/
+
+# Create DB with venv active that has flask installed. We want to ensure that we're
+# still able to resolve imports to flask, but don't want to extract EVERYTHING from
+# within the venv. Important note is that the test-file in the repo_dir actually imports
+# flask :D
+source repo_dir/venv/bin/activate
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_AUTOMATIC_VENV_EXCLUDE=
+$CODEQL database create dbs/normal-with-flask-venv --language python --source-root repo_dir/
+
+# ---
+
+set +x
+
+EXTRACTED_NORMAL=$(unzip -l dbs/normal/src.zip | wc -l)
+EXTRACTED_NO_VENV_IGNORE=$(unzip -l dbs/no-venv-ignore/src.zip | wc -l)
+EXTRACTED_ACTIVE_FLASK=$(unzip -l dbs/normal-with-flask-venv/src.zip | wc -l)
+
+exitcode=0
+
+echo "EXTRACTED_NORMAL=$EXTRACTED_NORMAL"
+echo "EXTRACTED_NO_VENV_IGNORE=$EXTRACTED_NO_VENV_IGNORE"
+echo "EXTRACTED_ACTIVE_FLASK=$EXTRACTED_ACTIVE_FLASK"
+# Automatic venv exclusion must extract fewer files than with the exclusion disabled.
+if [[ ! $EXTRACTED_NORMAL -lt $EXTRACTED_NO_VENV_IGNORE ]]; then
+    echo "ERROR: EXTRACTED_NORMAL not smaller than EXTRACTED_NO_VENV_IGNORE"
+    exitcode=1
+fi
+# With the flask venv active, the imported flask dependencies should add files ...
+if [[ ! $EXTRACTED_NORMAL -lt $EXTRACTED_ACTIVE_FLASK ]]; then
+    echo "ERROR: EXTRACTED_NORMAL not smaller than EXTRACTED_ACTIVE_FLASK"
+    exitcode=1
+fi
+# ... but still fewer than extracting EVERYTHING from within the venvs.
+if [[ ! $EXTRACTED_ACTIVE_FLASK -lt $EXTRACTED_NO_VENV_IGNORE ]]; then
+    echo "ERROR: EXTRACTED_ACTIVE_FLASK not smaller than EXTRACTED_NO_VENV_IGNORE"
+    exitcode=1
+fi
+
+exit $exitcode
--- a/python/extractor/cli-integration-test/pip-21.3-build-dir/.gitignore
+++ b/python/extractor/cli-integration-test/pip-21.3-build-dir/.gitignore
@@ -0,0 +1,2 @@
+repo_dir/build/
+dbs/
--- a/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/setup.py
+++ b/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/setup.py
@@ -0,0 +1,12 @@
+from setuptools import find_packages, setup
+
+# using src/ folder as recommended in: https://blog.ionelmc.ro/2014/05/25/python-packaging/
+
+setup(
+    name="example_pkg",
+    version="0.0.1",
+    description="example",
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    install_requires=[],
+)
--- a/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/src/example_pkg/__init__.py
+++ b/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/src/example_pkg/__init__.py
--- a/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/src/example_pkg/foo.py
+++ b/python/extractor/cli-integration-test/pip-21.3-build-dir/repo_dir/src/example_pkg/foo.py
@@ -0,0 +1 @@
+print(42)
--- a/python/extractor/cli-integration-test/pip-21.3-build-dir/test.sh
+++ b/python/extractor/cli-integration-test/pip-21.3-build-dir/test.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+NUM_PYTHON_FILES_IN_REPO=$(find repo_dir/src/ -name '*.py' | wc -l)
+
+rm -rf venv dbs
+
+mkdir dbs
+
+python3 -m venv venv
+
+source venv/bin/activate
+
+pip install --upgrade 'pip>=21.3'
+
+cd repo_dir
+pip install .
+cd "$SCRIPTDIR"
+
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_AUTOMATIC_PIP_BUILD_DIR_EXCLUDE=
+$CODEQL database create dbs/normal --language python --source-root repo_dir/
+
+export CODEQL_EXTRACTOR_PYTHON_DISABLE_AUTOMATIC_PIP_BUILD_DIR_EXCLUDE=1
+$CODEQL database create dbs/with-build-dir --language python --source-root repo_dir/
+
+EXTRACTED_NORMAL=$(unzip -l dbs/normal/src.zip | wc -l)
+EXTRACTED_WITH_BUILD=$(unzip -l dbs/with-build-dir/src.zip | wc -l)
+
+if [[ $((EXTRACTED_NORMAL + NUM_PYTHON_FILES_IN_REPO)) == $EXTRACTED_WITH_BUILD ]]; then
+    echo "Numbers add up"
+else
+    echo "Numbers did not add up"
+    echo "NUM_PYTHON_FILES_IN_REPO=$NUM_PYTHON_FILES_IN_REPO"
+    echo "EXTRACTED_NORMAL=$EXTRACTED_NORMAL"
+    echo "EXTRACTED_WITH_BUILD=$EXTRACTED_WITH_BUILD"
+    exit 1
+fi
--- a/python/extractor/cli-integration-test/python-2-deprecation/query.only-python2.expected
+++ b/python/extractor/cli-integration-test/python-2-deprecation/query.only-python2.expected
@@ -0,0 +1,5 @@
+|   name   |
+----------+
+| dircache |
+| stat     |
+| test     |
--- a/python/extractor/cli-integration-test/python-2-deprecation/query.python2-using-python3.expected
+++ b/python/extractor/cli-integration-test/python-2-deprecation/query.python2-using-python3.expected
@@ -0,0 +1,5 @@
+|   name   |
+----------+
+| dircache |
+| stat     |
+| test     |
--- a/python/extractor/cli-integration-test/python-2-deprecation/query.ql
+++ b/python/extractor/cli-integration-test/python-2-deprecation/query.ql
@@ -0,0 +1,18 @@
+import python
+import semmle.python.types.Builtins
+
+predicate named_entity(string name, string kind) {
+  exists(Builtin::special(name)) and kind = "special"
+  or
+  exists(Builtin::builtin(name)) and kind = "builtin"
+  or
+  exists(Module m | m.getName() = name) and kind = "module"
+  or
+  exists(File f | f.getShortName() = name + ".py") and kind = "file"
+}
+
+from string name
+where
+  name in ["dircache", "test", "stat"] and
+  named_entity(name, "file")
+select name order by name
--- a/python/extractor/cli-integration-test/python-2-deprecation/query.without-python2.expected
+++ b/python/extractor/cli-integration-test/python-2-deprecation/query.without-python2.expected
@@ -0,0 +1,4 @@
+| name |
+------+
+| stat |
+| test |
--- a/python/extractor/cli-integration-test/python-2-deprecation/repo_dir/setup.py
+++ b/python/extractor/cli-integration-test/python-2-deprecation/repo_dir/setup.py
@@ -0,0 +1 @@
+"Programming Language :: Python :: 2"
--- a/python/extractor/cli-integration-test/python-2-deprecation/repo_dir/test.py
+++ b/python/extractor/cli-integration-test/python-2-deprecation/repo_dir/test.py
@@ -0,0 +1,5 @@
+# `dircache` was removed in Python 3, and so is a good test of which standard library we're
+# extracting.
+import dircache
+# A module that's present in both Python 2 and 3
+import stat
--- a/python/extractor/cli-integration-test/python-2-deprecation/test.sh
+++ b/python/extractor/cli-integration-test/python-2-deprecation/test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf dbs
+rm -f *.actual
+
+mkdir dbs
+
+# NB: on our Linux CI infrastructure, `python` is aliased to `python3`.
+WITHOUT_PYTHON2=$(pwd)/without-python2
+WITHOUT_PYTHON3=$(pwd)/without-python3
+
+echo "Test 1: Only Python 2 is available. Should fail."
+# Note the negation at the start of the command.
+! PATH="$WITHOUT_PYTHON3:$PATH" $CODEQL database create dbs/only-python2-no-flag --language python --source-root repo_dir/
+
+echo "Test 2: Only Python 3 is available. Should extract using Python 3 and use the Python 3 standard library."
+PATH="$WITHOUT_PYTHON2:$PATH" $CODEQL database create dbs/without-python2 --language python --source-root repo_dir/
+$CODEQL query run --database dbs/without-python2 query.ql > query.without-python2.actual
+diff query.without-python2.expected query.without-python2.actual
+
+echo "Test 3: Python 2 and 3 are both available. Should extract using Python 3, but use the Python 2 standard library."
+$CODEQL database create dbs/python2-using-python3 --language python --source-root repo_dir/
+$CODEQL query run --database dbs/python2-using-python3 query.ql > query.python2-using-python3.actual
+diff query.python2-using-python3.expected query.python2-using-python3.actual
+
+rm -f *.actual
--- a/python/extractor/cli-integration-test/python-2-deprecation/without-python2/python2
+++ b/python/extractor/cli-integration-test/python-2-deprecation/without-python2/python2
@@ -0,0 +1,4 @@
+echo "Attempted to run:"
+echo "  python2 $@"
+echo "Failing instead."
+exit 127
--- a/python/extractor/cli-integration-test/python-2-deprecation/without-python2/which
+++ b/python/extractor/cli-integration-test/python-2-deprecation/without-python2/which
@@ -0,0 +1,6 @@
+#!/bin/bash -p
+
+case $1 in
+    python2)   exit 1;;
+    *)         command /usr/bin/which -- "$1";;
+esac
--- a/python/extractor/cli-integration-test/python-2-deprecation/without-python3/python
+++ b/python/extractor/cli-integration-test/python-2-deprecation/without-python3/python
@@ -0,0 +1,4 @@
+echo "Attempted to run:"
+echo "  python $@"
+echo "Failing instead."
+exit 127
--- a/python/extractor/cli-integration-test/python-2-deprecation/without-python3/python3
+++ b/python/extractor/cli-integration-test/python-2-deprecation/without-python3/python3
@@ -0,0 +1,4 @@
+echo "Attempted to run:"
+echo "  python3 $@"
+echo "Failing instead."
+exit 127
--- a/python/extractor/cli-integration-test/python-2-deprecation/without-python3/which
+++ b/python/extractor/cli-integration-test/python-2-deprecation/without-python3/which
@@ -0,0 +1,9 @@
+#!/bin/bash -p
+
+echo "Fake which called with arguments: $@"
+
+case $1 in
+    python)    exit 1;;
+    python3)   exit 1;;
+    *)         command /usr/bin/which -- "$1";;
+esac
--- a/python/extractor/cli-integration-test/run-all-tests.sh
+++ b/python/extractor/cli-integration-test/run-all-tests.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Runs every */test.sh next to this script, keeps going after a failure, and lists the failed tests.
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+failures=()
+for f in */test.sh; do
+    echo "Running $f:"
+    if ! bash "$f"; then
+        echo "ERROR: $f failed"
+        failures+=("$f")
+    fi
+    echo "---"
+done
+# Count the elements rather than expanding the array: an empty "${failures[*]}" trips `set -u` on bash < 4.4.
+if [ "${#failures[@]}" -eq 0 ]; then
+    echo "All integration tests passed!"
+    exit 0
+else
+    echo "ERROR: Some integration test failed! Failures:"
+    for failure in "${failures[@]}"
+    do
+        echo "- ${failure}"
+    done
+    exit 1
+fi
--- a/python/extractor/cli-integration-test/stdout-encoding/repo_dir/ನನ್ನ_ಸ್ಕ್ರಿಪ್ಟ್.py
+++ b/python/extractor/cli-integration-test/stdout-encoding/repo_dir/ನನ್ನ_ಸ್ಕ್ರಿಪ್ಟ್.py
@@ -0,0 +1 @@
+print(42)
--- a/python/extractor/cli-integration-test/stdout-encoding/test.sh
+++ b/python/extractor/cli-integration-test/stdout-encoding/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+
+# even with default encoding that doesn't support utf-8 (like on windows) we want to
+# ensure that we can properly log that we've extracted files whose filenames contain
+# utf-8 chars
+export PYTHONIOENCODING="ascii"
+$CODEQL database create db --language python --source-root repo_dir/
--- a/python/extractor/cli-integration-test/symlinks/.gitignore
+++ b/python/extractor/cli-integration-test/symlinks/.gitignore
@@ -0,0 +1,2 @@
+repo_dir/subdir
+repo_dir/symlink_to_top
--- a/python/extractor/cli-integration-test/symlinks/query.ql
+++ b/python/extractor/cli-integration-test/symlinks/query.ql
@@ -0,0 +1 @@
+select 1
--- a/python/extractor/cli-integration-test/symlinks/repo_dir/foo.py
+++ b/python/extractor/cli-integration-test/symlinks/repo_dir/foo.py
@@ -0,0 +1 @@
+print(42)
--- a/python/extractor/cli-integration-test/symlinks/test.sh
+++ b/python/extractor/cli-integration-test/symlinks/test.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+
+# create two symlink loops, so
+# - repo_dir/subdir/symlink_to_top -> repo_dir
+# - repo_dir/symlink_to_top -> repo_dir
+# such a setup was seen in https://github.com/PowerDNS/weakforced
+
+rm -rf repo_dir/subdir
+mkdir repo_dir/subdir
+ln -s .. repo_dir/subdir/symlink_to_top
+
+rm -f repo_dir/symlink_to_top
+ln -s . repo_dir/symlink_to_top
+
+timeout --verbose 15s $CODEQL database create db --language python --source-root repo_dir/
+$CODEQL query run --database db query.ql
--- a/python/extractor/cli-integration-test/writing-diagnostics/diagnostics.expected
+++ b/python/extractor/cli-integration-test/writing-diagnostics/diagnostics.expected
@@ -0,0 +1,163 @@
+{
+  "attributes": {
+    "args": [
+      "Syntax Error"
+    ],
+    "traceback": [
+      "\"semmle/python/modules.py\", line 108, in py_ast",
+      "\"semmle/python/modules.py\", line 102, in old_py_ast",
+      "\"semmle/python/parser/__init__.py\", line 100, in parse",
+      "\"semmleFile \"<string>\", line 1",
+      "\"semmle/python/extractor.py\", line 84, in process_source_module",
+      "\"semmle/python/modules.py\", line 92, in ast",
+      "\"semmle/python/modules.py\", line 120, in py_ast",
+      "\"semmle/python/modules.py\", line 117, in py_ast",
+      "\"semmle/python/parser/tsg_parser.py\", line 221, in parse",
+      "\"semmleFile \"<string>\", line 1"
+    ]
+  },
+  "location": {
+    "file": "<test-root-directory>/repo_dir/syntaxerror3.py",
+    "startColumn": 0,
+    "endColumn": 0,
+    "startLine": 1,
+    "endLine": 1
+  },
+  "markdownMessage": "A parse error occurred while processing `<test-root-directory>/repo_dir/syntaxerror3.py`, and as a result this file could not be analyzed. Check the syntax of the file using the `python -m py_compile` command and correct any invalid syntax.",
+  "severity": "warning",
+  "source": {
+    "extractorName": "python",
+    "id": "py/diagnostics/syntax-error",
+    "name": "Could not process some files due to syntax errors"
+  },
+  "timestamp": "2023-03-13T15:03:48.177832",
+  "visibility": {
+    "cliSummaryTable": true,
+    "statusPage": true,
+    "telemetry": true
+  }
+}
+{
+  "attributes": {
+    "args": [
+      "Syntax Error"
+    ],
+    "traceback": [
+      "\"semmle/python/modules.py\", line 108, in py_ast",
+      "\"semmle/python/modules.py\", line 102, in old_py_ast",
+      "\"semmle/python/parser/__init__.py\", line 100, in parse",
+      "\"semmleFile \"<string>\", line 3",
+      "\"semmle/python/extractor.py\", line 84, in process_source_module",
+      "\"semmle/python/modules.py\", line 92, in ast",
+      "\"semmle/python/modules.py\", line 120, in py_ast",
+      "\"semmle/python/modules.py\", line 117, in py_ast",
+      "\"semmle/python/parser/tsg_parser.py\", line 221, in parse",
+      "\"semmleFile \"<string>\", line 3"
+    ]
+  },
+  "location": {
+    "file": "<test-root-directory>/repo_dir/syntaxerror1.py",
+    "startColumn": 0,
+    "endColumn": 0,
+    "startLine": 3,
+    "endLine": 3
+  },
+  "markdownMessage": "A parse error occurred while processing `<test-root-directory>/repo_dir/syntaxerror1.py`, and as a result this file could not be analyzed. Check the syntax of the file using the `python -m py_compile` command and correct any invalid syntax.",
+  "severity": "warning",
+  "source": {
+    "extractorName": "python",
+    "id": "py/diagnostics/syntax-error",
+    "name": "Could not process some files due to syntax errors"
+  },
+  "timestamp": "2023-03-13T15:03:48.181384",
+  "visibility": {
+    "cliSummaryTable": true,
+    "statusPage": true,
+    "telemetry": true
+  }
+}
+{
+  "attributes": {
+    "args": [
+      "Syntax Error"
+    ],
+    "traceback": [
+      "\"semmle/python/modules.py\", line 108, in py_ast",
+      "\"semmle/python/modules.py\", line 102, in old_py_ast",
+      "\"semmle/python/parser/__init__.py\", line 100, in parse",
+      "\"semmleFile \"<string>\", line 6",
+      "\"semmle/python/extractor.py\", line 84, in process_source_module",
+      "\"semmle/python/modules.py\", line 92, in ast",
+      "\"semmle/python/modules.py\", line 120, in py_ast",
+      "\"semmle/python/modules.py\", line 117, in py_ast",
+      "\"semmle/python/parser/tsg_parser.py\", line 221, in parse",
+      "\"semmleFile \"<string>\", line 5"
+    ]
+  },
+  "location": {
+    "file": "<test-root-directory>/repo_dir/syntaxerror2.py",
+    "startColumn": 0,
+    "endColumn": 0,
+    "startLine": 5,
+    "endLine": 5
+  },
+  "markdownMessage": "A parse error occurred while processing `<test-root-directory>/repo_dir/syntaxerror2.py`, and as a result this file could not be analyzed. Check the syntax of the file using the `python -m py_compile` command and correct any invalid syntax.",
+  "severity": "warning",
+  "source": {
+    "extractorName": "python",
+    "id": "py/diagnostics/syntax-error",
+    "name": "Could not process some files due to syntax errors"
+  },
+  "timestamp": "2023-03-13T15:03:48.164991",
+  "visibility": {
+    "cliSummaryTable": true,
+    "statusPage": true,
+    "telemetry": true
+  }
+}
+{
+  "attributes": {
+    "args": [
+      "maximum recursion depth exceeded while calling a Python object"
+    ],
+    "traceback": [
+      "\"semmle/worker.py\", line 235, in _extract_loop",
+      "\"semmle/extractors/super_extractor.py\", line 37, in process",
+      "\"semmle/extractors/py_extractor.py\", line 43, in process",
+      "\"semmle/python/extractor.py\", line 227, in process_source_module",
+      "\"semmle/python/extractor.py\", line 84, in process_source_module",
+      "\"semmle/python/modules.py\", line 96, in ast",
+      "\"semmle/python/passes/labeller.py\", line 85, in apply",
+      "\"semmle/python/passes/labeller.py\", line 44, in __init__",
+      "\"semmle/python/passes/labeller.py\", line 14, in __init__",
+      "\"semmle/python/passes/ast_pass.py\", line 208, in visit",
+      "\"semmle/python/passes/ast_pass.py\", line 216, in generic_visit",
+      "\"semmle/python/passes/ast_pass.py\", line 213, in generic_visit",
+      "\"semmle/python/passes/ast_pass.py\", line 208, in visit",
+      "\"semmle/python/passes/ast_pass.py\", line 213, in generic_visit",
+      "\"semmle/python/passes/ast_pass.py\", line 208, in visit",
+      "... 3930 lines skipped",
+      "\"semmle/python/passes/ast_pass.py\", line 213, in generic_visit",
+      "\"semmle/python/passes/ast_pass.py\", line 208, in visit",
+      "\"semmle/python/passes/ast_pass.py\", line 213, in generic_visit",
+      "\"semmle/python/passes/ast_pass.py\", line 208, in visit",
+      "\"semmle/python/passes/ast_pass.py\", line 205, in _get_visit_method"
+    ]
+  },
+  "location": {
+    "file": "<test-root-directory>/repo_dir/recursion_error.py"
+  },
+  "plaintextMessage": "maximum recursion depth exceeded while calling a Python object",
+  "severity": "error",
+  "source": {
+    "extractorName": "python",
+    "id": "py/diagnostics/recursion-error",
+    "name": "Recursion error in Python extractor"
+  },
+  "timestamp": "2023-03-13T15:03:47.468924",
+  "visibility": {
+    "cliSummaryTable": false,
+    "statusPage": false,
+    "telemetry": true
+  }
+}
--- a/python/extractor/cli-integration-test/writing-diagnostics/make_test.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/make_test.py
@@ -0,0 +1,4 @@
+
+# Creates a test file that will cause a RecursionError when run with the Python extractor.
+with open('repo_dir/recursion_error.py', 'w') as f:
+    f.write("print({})\n".format("+".join(["1"] * 1000)))
--- a/python/extractor/cli-integration-test/writing-diagnostics/query.expected
+++ b/python/extractor/cli-integration-test/writing-diagnostics/query.expected
@@ -0,0 +1,6 @@
+|    filename     |
+-----------------+
+| safe.py         |
+| syntaxerror1.py |
+| syntaxerror2.py |
+| syntaxerror3.py |
--- a/python/extractor/cli-integration-test/writing-diagnostics/query.ql
+++ b/python/extractor/cli-integration-test/writing-diagnostics/query.ql
@@ -0,0 +1,3 @@
+import python
+
+select any(File f).getShortName() as filename order by filename
--- a/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/safe.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/safe.py
@@ -0,0 +1 @@
+print("No deeply nested structures here!")
--- a/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror1.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror1.py
@@ -0,0 +1,3 @@
+# This file contains a deliberate syntax error
+
+2 +
--- a/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror2.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror2.py
@@ -0,0 +1,5 @@
+
+
+
+
+[
--- a/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror3.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/repo_dir/syntaxerror3.py
@@ -0,0 +1 @@
+"Oh no!
--- a/python/extractor/cli-integration-test/writing-diagnostics/test.sh
+++ b/python/extractor/cli-integration-test/writing-diagnostics/test.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+rm -f *.actual
+
+python3 make_test.py
+
+echo "Testing database with various errors during extraction"
+$CODEQL database create db --language python --source-root repo_dir/
+$CODEQL query run --database db query.ql > query.actual
+diff query.expected query.actual
+python3 test_diagnostics_output.py
+
+rm -f *.actual
+rm -f repo_dir/recursion_error.py
+rm -rf db
--- a/python/extractor/cli-integration-test/writing-diagnostics/test_diagnostics_output.py
+++ b/python/extractor/cli-integration-test/writing-diagnostics/test_diagnostics_output.py
@@ -0,0 +1,7 @@
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "integration-tests"))
+import diagnostics_test_utils
+
+test_db = "db"
+diagnostics_test_utils.check_diagnostics(".", test_db, skip_attributes=True)