Merge pull request #5832 from tamasvajk/feature/csv-coverage-report

Java: github action for CSV coverage report
2026-07-21 11:18:20 +02:00 · 2021-05-25 14:51:19 +02:00
parent d05f524759 70b3066bb8
commit 1997f500c2
8 changed files with 568 additions and 0 deletions
--- a/.github/workflows/csv-coverage.yml
+++ b/.github/workflows/csv-coverage.yml
@@ -0,0 +1,77 @@
+name: Build/check CSV flow coverage report
+
+on:
+  workflow_dispatch:
+    inputs:
+      qlModelShaOverride:
+        description: 'github/codeql repo SHA used for looking up the CSV models'
+        required: false
+  push:
+    branches:
+     - main
+     - 'rc/**'
+  pull_request:
+    paths:
+      - '.github/workflows/csv-coverage.yml'
+      - '*/ql/src/**/*.ql'
+      - '*/ql/src/**/*.qll'
+      - 'misc/scripts/library-coverage/*.py'
+      # input data files
+      - '*/documentation/library-coverage/cwe-sink.csv'
+      - '*/documentation/library-coverage/frameworks.csv'
+      # coverage report files
+      - '*/documentation/library-coverage/flow-model-coverage.csv'
+      - '*/documentation/library-coverage/flow-model-coverage.rst'
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Clone self (github/codeql)
+      uses: actions/checkout@v2
+      with:
+        path: script
+    - name: Clone self (github/codeql) at a given SHA for analysis
+      if: github.event.inputs.qlModelShaOverride != ''
+      uses: actions/checkout@v2
+      with:
+        path: codeqlModels
+        ref: ${{ github.event.inputs.qlModelShaOverride }}  # 'with' values are literal strings unless wrapped in ${{ }}
+    - name: Clone self (github/codeql) for analysis
+      if: github.event.inputs.qlModelShaOverride == ''
+      uses: actions/checkout@v2
+      with:
+        path: codeqlModels
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Download CodeQL CLI
+      uses: dsaltares/fetch-gh-release-asset@aa37ae5c44d3c9820bc12fe675e8670ecd93bd1c
+      with:
+        repo: "github/codeql-cli-binaries"
+        version: "latest"
+        file: "codeql-linux64.zip"
+        token: ${{ secrets.GITHUB_TOKEN }}
+    - name: Unzip CodeQL CLI
+      run: unzip -d codeql-cli codeql-linux64.zip
+    - name: Build modeled package list
+      run: |
+        PATH="$PATH:codeql-cli/codeql" python script/misc/scripts/library-coverage/generate-report.py ci codeqlModels script
+    - name: Upload CSV package list
+      uses: actions/upload-artifact@v2
+      with:
+        name: csv-flow-model-coverage
+        path: flow-model-coverage-*.csv
+    - name: Upload RST package list
+      uses: actions/upload-artifact@v2
+      with:
+        name: rst-flow-model-coverage
+        path: flow-model-coverage-*.rst
+    - name: Check coverage files
+      if: github.event.pull_request
+      run: |
+        python script/misc/scripts/library-coverage/compare-files.py codeqlModels
+
--- a/java/documentation/library-coverage/cwe-sink.csv
+++ b/java/documentation/library-coverage/cwe-sink.csv
@@ -0,0 +1,8 @@
+CWE,Sink identifier,Label
+CWE‑089,sql,SQL injection
+CWE‑022,create-file,Path injection
+CWE‑036,url-open-stream,Path traversal
+CWE‑094,bean-validation,Code injection
+CWE‑319,open-url,Cleartext transmission
+CWE‑079,xss,Cross-site scripting
+CWE‑090,ldap,LDAP injection
--- a/java/documentation/library-coverage/flow-model-coverage.csv
+++ b/java/documentation/library-coverage/flow-model-coverage.csv
@@ -0,0 +1,42 @@
+package,sink,source,summary,sink:bean-validation,sink:create-file,sink:header-splitting,sink:ldap,sink:open-url,sink:set-hostname-verifier,sink:url-open-stream,sink:xpath,sink:xss,source:remote,summary:taint,summary:value
+android.util,,16,,,,,,,,,,,16,,
+android.webkit,3,2,,,,,,,,,,3,2,,
+com.esotericsoftware.kryo.io,,,1,,,,,,,,,,,1,
+com.esotericsoftware.kryo5.io,,,1,,,,,,,,,,,1,
+com.fasterxml.jackson.databind,,,2,,,,,,,,,,,2,
+com.google.common.base,,,28,,,,,,,,,,,22,6
+com.google.common.io,6,,69,,,,,,,6,,,,68,1
+com.unboundid.ldap.sdk,17,,,,,,17,,,,,,,,
+java.beans,,,1,,,,,,,,,,,1,
+java.io,3,,20,,3,,,,,,,,,20,
+java.lang,,,1,,,,,,,,,,,1,
+java.net,2,3,4,,,,,2,,,,,3,4,
+java.nio,10,,2,,10,,,,,,,,,2,
+java.util,,,13,,,,,,,,,,,13,
+javax.naming.directory,1,,,,,,1,,,,,,,,
+javax.net.ssl,2,,,,,,,,2,,,,,,
+javax.servlet,4,21,2,,,3,,,,,,1,21,2,
+javax.validation,1,1,,1,,,,,,,,,1,,
+javax.ws.rs.core,1,,,,,1,,,,,,,,,
+javax.xml.transform.sax,,,4,,,,,,,,,,,4,
+javax.xml.transform.stream,,,2,,,,,,,,,,,2,
+javax.xml.xpath,3,,,,,,,,,,3,,,,
+org.apache.commons.codec,,,2,,,,,,,,,,,2,
+org.apache.commons.io,,,22,,,,,,,,,,,22,
+org.apache.commons.lang3,,,313,,,,,,,,,,,299,14
+org.apache.commons.text,,,203,,,,,,,,,,,203,
+org.apache.directory.ldap.client.api,1,,,,,,1,,,,,,,,
+org.apache.hc.core5.function,,,1,,,,,,,,,,,1,
+org.apache.hc.core5.http,1,2,39,,,,,,,,,1,2,39,
+org.apache.hc.core5.net,,,2,,,,,,,,,,,2,
+org.apache.hc.core5.util,,,22,,,,,,,,,,,18,4
+org.apache.http,2,3,66,,,,,,,,,2,3,59,7
+org.dom4j,20,,,,,,,,,,20,,,,
+org.springframework.ldap.core,14,,,,,,14,,,,,,,,
+org.springframework.security.web.savedrequest,,6,,,,,,,,,,,6,,
+org.springframework.web.client,,3,,,,,,,,,,,3,,
+org.springframework.web.context.request,,8,,,,,,,,,,,8,,
+org.springframework.web.multipart,,12,,,,,,,,,,,12,,
+org.xml.sax,,,1,,,,,,,,,,,1,
+org.xmlpull.v1,,3,,,,,,,,,,,3,,
+play.mvc,,4,,,,,,,,,,,4,,
--- a/java/documentation/library-coverage/flow-model-coverage.rst
+++ b/java/documentation/library-coverage/flow-model-coverage.rst
@@ -0,0 +1,19 @@
+Java framework & library support
+================================
+
+.. csv-table::
+   :header-rows: 1
+   :class: fullWidthTable
+   :widths: auto
+
+   Framework / library,Package,Remote flow sources,Taint & value steps,Sinks (total),`CWE‑022` :sub:`Path injection`,`CWE‑036` :sub:`Path traversal`,`CWE‑079` :sub:`Cross-site scripting`,`CWE‑089` :sub:`SQL injection`,`CWE‑090` :sub:`LDAP injection`,`CWE‑094` :sub:`Code injection`,`CWE‑319` :sub:`Cleartext transmission`
+   Android,``android.*``,18,,3,,,3,,,,
+   Apache,``org.apache.*``,5,648,4,,,3,,1,,
+   `Apache Commons IO <https://commons.apache.org/proper/commons-io/>`_,``org.apache.commons.io``,,22,,,,,,,,
+   Google,``com.google.common.*``,,97,6,,6,,,,,
+   Java Standard Library,``java.*``,3,41,15,13,,,,,,2
+   Java extensions,``javax.*``,22,8,12,,,1,,1,1,
+   `Spring <https://spring.io/>`_,``org.springframework.*``,29,,14,,,,,14,,
+   Others,"``com.esotericsoftware.kryo.io``, ``com.esotericsoftware.kryo5.io``, ``com.fasterxml.jackson.databind``, ``com.unboundid.ldap.sdk``, ``org.dom4j``, ``org.xml.sax``, ``org.xmlpull.v1``, ``play.mvc``",7,5,37,,,,,17,,
+   Totals,,84,821,91,13,6,7,,33,1,2
+
--- a/java/documentation/library-coverage/frameworks.csv
+++ b/java/documentation/library-coverage/frameworks.csv
@@ -0,0 +1,8 @@
+Framework name,URL,Package prefix
+Java Standard Library,,java.*
+Google,,com.google.common.*
+Apache,,org.apache.*
+Apache Commons IO,https://commons.apache.org/proper/commons-io/,org.apache.commons.io
+Android,,android.*
+Spring,https://spring.io/,org.springframework.*
+Java extensions,,javax.*
--- a/misc/scripts/library-coverage/compare-files.py
+++ b/misc/scripts/library-coverage/compare-files.py
@@ -0,0 +1,54 @@
+import sys
+import os
+import settings
+import difflib
+
+"""
+This script compares the generated CSV coverage files with the ones in the codebase.
+"""
+
+
+def check_file_exists(file):
+    if not os.path.exists(file):
+        print("Expected file '" + file + "' doesn't exist.", file=sys.stderr)
+        sys.exit(1)
+
+
+def ignore_line_ending(ch):
+    return difflib.IS_CHARACTER_JUNK(ch, ws=" \r\n")
+
+
+def compare_files(file1, file2):
+    has_differences = False
+    diff = difflib.ndiff(open(file1).readlines(),
+                         open(file2).readlines(), None, ignore_line_ending)
+    for line in diff:
+        if line.startswith("+") or line.startswith("-"):
+            print(line, end="", file=sys.stderr)
+            has_differences = True
+
+    if has_differences:
+        print("Error: The generated file doesn't match the one in the codebase. Please check and fix file '" +
+              file1 + "'.", file=sys.stderr)
+        sys.exit(1)
+
+
+languages = ['java']
+
+for lang in languages:
+    repo_output_rst = settings.repo_output_rst.format(language=lang)
+    repo_output_csv = settings.repo_output_csv.format(language=lang)
+
+    generated_output_rst = settings.generated_output_rst.format(language=lang)
+    generated_output_csv = settings.generated_output_csv.format(language=lang)
+
+    check_file_exists(repo_output_rst)
+    check_file_exists(repo_output_csv)
+    check_file_exists(generated_output_rst)
+    check_file_exists(generated_output_csv)
+
+    compare_files(repo_output_rst, generated_output_rst)
+    compare_files(repo_output_csv, generated_output_csv)
+
+    print("The generated files for '" + lang +
+          "' match the ones in the codebase.")
--- a/misc/scripts/library-coverage/generate-report.py
+++ b/misc/scripts/library-coverage/generate-report.py
@@ -0,0 +1,340 @@
+import subprocess
+import csv
+import sys
+import os
+import shutil
+import settings
+
+"""
+This script runs the CSV coverage report QL query, and transforms it to a more readable format.
+There are two main outputs: (i) a CSV file containing the coverage data, and (ii) an RST page containing the coverage
+data.
+ """
+
+
+def subprocess_run(cmd):
+    """Runs a command through subprocess.run, with a few tweaks. Raises an Exception if exit code != 0."""
+    return subprocess.run(cmd, capture_output=True, text=True, env=os.environ.copy(), check=True)
+
+
+def create_empty_database(lang, extension, database):
+    """Creates an empty database for the given language."""
+    subprocess_run(["codeql", "database", "init", "--language=" + lang,
+                   "--source-root=/tmp/empty", "--allow-missing-source-root", database])
+    subprocess_run(["mkdir", "-p", database + "/src/tmp/empty"])
+    subprocess_run(["touch", database + "/src/tmp/empty/empty" + extension])
+    subprocess_run(["codeql", "database", "finalize",
+                   database, "--no-pre-finalize"])
+
+
+def run_codeql_query(query, database, output):
+    """Runs a codeql query on the given database."""
+    subprocess_run(["codeql", "query", "run", query,
+                   "--database", database, "--output", output + ".bqrs"])
+    subprocess_run(["codeql", "bqrs", "decode", output + ".bqrs",
+                   "--format=csv", "--no-titles", "--output", output])
+    os.remove(output + ".bqrs")
+
+
+def append_csv_number(list, value):
+    """Adds a number to the list or None if the value is not greater than 0."""
+    if value > 0:
+        list.append(value)
+    else:
+        list.append(None)
+
+
+def append_csv_dict_item(list, dictionary, key):
+    """Adds a dictionary item to the list if the key is in the dictionary."""
+    if key in dictionary:
+        list.append(dictionary[key])
+    else:
+        list.append(None)
+
+
+def increment_dict_item(value, dictionary, key):
+    """Increments the value of the dictionary[key] by value."""
+    if key not in dictionary:
+        dictionary[key] = 0
+    dictionary[key] += int(value)
+
+
+def collect_package_stats(packages, cwes, filter):
+    """
+    Collects coverage statistics for packages matching the given filter. `filter` is a `lambda` that for example (i) matches
+    packages to frameworks, or (2) matches packages that were previously not processed.
+
+    The returned statistics are used to generate a single row in a CSV file.
+    """
+    sources = 0
+    steps = 0
+    sinks = 0
+    framework_cwes = {}
+    processed_packages = set()
+
+    for package in packages:
+        if filter(package):
+            processed_packages.add(package)
+            sources += int(packages[package]["kind"].get("source:remote", 0))
+            steps += int(packages[package]["part"].get("summary", 0))
+            sinks += int(packages[package]["part"].get("sink", 0))
+
+            for cwe in cwes:
+                sink = "sink:" + cwes[cwe]["sink"]
+                if sink in packages[package]["kind"]:
+                    if cwe not in framework_cwes:
+                        framework_cwes[cwe] = 0
+                    framework_cwes[cwe] += int(
+                        packages[package]["kind"][sink])
+
+    return sources, steps, sinks, framework_cwes, processed_packages
+
+
+def add_package_stats_to_row(row, sorted_cwes, collect):
+    """
+    Adds collected statistic to the row. `collect` is a `lambda` that returns the statistics for example for (i) individual
+    frameworks, (ii) leftout frameworks summarized in the 'Others' row, or (iii) all frameworks summarized in the 'Totals'
+    row.
+    """
+    sources, steps, sinks, framework_cwes, processed_packages = collect()
+
+    append_csv_number(row, sources)
+    append_csv_number(row, steps)
+    append_csv_number(row, sinks)
+
+    for cwe in sorted_cwes:
+        append_csv_dict_item(row, framework_cwes, cwe)
+
+    return row, processed_packages
+
+
+class LanguageConfig:
+    def __init__(self, lang, capitalized_lang, ext, ql_path):
+        self.lang = lang
+        self.capitalized_lang = capitalized_lang
+        self.ext = ext
+        self.ql_path = ql_path
+
+
+try:  # Check for `codeql` on path
+    subprocess_run(["codeql", "--version"])
+except Exception as e:
+    print("Error: couldn't invoke CodeQL CLI 'codeql'. Is it on the path? Aborting.", file=sys.stderr)
+    raise e
+
+# The script can be run in two modes:
+# (i) dev: run on the local developer machine, and collect the coverage data. The output is generated into the expected
+#          folders: {language}/documentation/library-coverage/
+# (ii) ci: run in a CI action. The output is generated to the root folder, and then in a subsequent step packaged as a
+#          build artifact.
+mode = "dev"
+if len(sys.argv) > 1:
+    mode = sys.argv[1]
+
+if mode != "dev" and mode != "ci":
+    print("Unknown execution mode: " + mode +
+          ". Expected either 'dev' or 'ci'.", file=sys.stderr)
+    exit(1)
+
+# The QL model holding the CSV info can come directly from a PR or the main branch, but optionally we can use an earlier
+# SHA too, therefore it's checked out separately into a dedicated subfolder.
+query_prefix = ""
+if len(sys.argv) > 2:
+    query_prefix = sys.argv[2] + "/"
+
+
+# Languages for which we want to generate coverage reports.
+configs = [
+    LanguageConfig(
+        "java", "Java", ".java", query_prefix + "java/ql/src/meta/frameworks/Coverage.ql")
+]
+
+# The names of input and output files. The placeholder {language} is replaced with the language name.
+output_ql_csv = "output-{language}.csv"
+input_framework_csv = settings.documentation_folder + "frameworks.csv"
+input_cwe_sink_csv = settings.documentation_folder + "cwe-sink.csv"
+
+if mode == "dev":
+    output_rst = settings.repo_output_rst
+    output_csv = settings.repo_output_csv
+else:
+    output_rst = settings.generated_output_rst
+    output_csv = settings.generated_output_csv
+
+for config in configs:
+    lang = config.lang
+    db = "empty-" + lang
+    ql_output = output_ql_csv.format(language=lang)
+    create_empty_database(lang, config.ext, db)
+    run_codeql_query(config.ql_path, db, ql_output)
+    shutil.rmtree(db)
+
+    packages = {}
+    parts = set()
+    kinds = set()
+
+    # Read the generated CSV file, and collect package statistics.
+    with open(ql_output) as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            # row: "android.util",1,"remote","source",16
+            package = row[0]
+            if package not in packages:
+                packages[package] = {
+                    "count": row[1],
+                    # part: "summary", "sink", or "source"
+                    "part": {},
+                    # kind: "source:remote", "sink:create-file", ...
+                    "kind": {}
+                }
+
+            part = row[3]
+            parts.add(part)
+            increment_dict_item(row[4], packages[package]["part"], part)
+
+            kind = part + ":" + row[2]
+            kinds.add(kind)
+            increment_dict_item(row[4], packages[package]["kind"], kind)
+
+    os.remove(ql_output)
+
+    parts = sorted(parts)
+    kinds = sorted(kinds)
+
+    # Write the denormalized package statistics to a CSV file.
+    with open(output_csv.format(language=lang), 'w', newline='') as csvfile:
+        csvwriter = csv.writer(csvfile)
+
+        headers = ["package"]
+        headers.extend(parts)
+        headers.extend(kinds)
+
+        csvwriter.writerow(headers)
+
+        for package in sorted(packages):
+            row = [package]
+            for part in parts:
+                append_csv_dict_item(row, packages[package]["part"], part)
+            for kind in kinds:
+                append_csv_dict_item(row, packages[package]["kind"], kind)
+            csvwriter.writerow(row)
+
+    # Read the additional framework data, such as URL, friendly name
+    frameworks = {}
+
+    with open(input_framework_csv.format(language=lang)) as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader)
+        for row in reader:
+            # row: Hibernate,https://hibernate.org/,org.hibernate
+            framwork = row[0]
+            if framwork not in frameworks:
+                frameworks[framwork] = {
+                    "package": row[2],
+                    "url": row[1]
+                }
+
+    # Read the additional CWE data
+    cwes = {}
+
+    with open(input_cwe_sink_csv.format(language=lang)) as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader)
+        for row in reader:
+            # row: CWE-89,sql,SQL injection
+            cwe = row[0]
+            if cwe not in cwes:
+                cwes[cwe] = {
+                    "sink": row[1],
+                    "label": row[2]
+                }
+
+    sorted_cwes = sorted(cwes)
+
+    with open(output_rst.format(language=lang), 'w', newline='') as rst_file:
+        rst_file.write(
+            config.capitalized_lang + " framework & library support\n")
+        rst_file.write("================================\n\n")
+        rst_file.write(".. csv-table::\n")
+        rst_file.write("   :header-rows: 1\n")
+        rst_file.write("   :class: fullWidthTable\n")
+        rst_file.write("   :widths: auto\n\n")
+
+        row_prefix = "   "
+
+        # Write CSV file with package statistics and framework data to be used in RST file.
+        csvwriter = csv.writer(rst_file)
+
+        # Write CSV header.
+        headers = [row_prefix + "Framework / library",
+                   "Package",
+                   "Remote flow sources",
+                   "Taint & value steps",
+                   "Sinks (total)"]
+        for cwe in sorted_cwes:
+            headers.append(
+                "`{0}` :sub:`{1}`".format(cwe, cwes[cwe]["label"]))
+        csvwriter.writerow(headers)
+
+        processed_packages = set()
+
+        all_package_patterns = set(
+            (frameworks[fr]["package"] for fr in frameworks))
+
+        # Write a row for each framework.
+        for framework in sorted(frameworks):
+            row = []
+
+            # Add the framework name to the row
+            if not frameworks[framework]["url"]:
+                row.append(row_prefix + framework)
+            else:
+                row.append(
+                    row_prefix + "`" + framework + " <" + frameworks[framework]["url"] + ">`_")
+
+            # Add the package name to the row
+            row.append("``" + frameworks[framework]["package"] + "``")
+
+            current_package_pattern = frameworks[framework]["package"]
+
+            # Collect statistics on the current framework
+            # current_package_pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
+            # Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
+            # the latter will not include the statistics for the former.
+            def package_match(package_name, pattern): return (pattern.endswith(
+                "*") and package_name.startswith(pattern[:-1])) or (not pattern.endswith("*") and pattern == package_name)
+
+            def collect_framework(): return collect_package_stats(
+                packages, cwes, lambda p: package_match(p, current_package_pattern) and all(len(current_package_pattern) >= len(pattern) or not package_match(p, pattern) for pattern in all_package_patterns))
+
+            row, f_processed_packages = add_package_stats_to_row(
+                row, sorted_cwes, collect_framework)
+
+            csvwriter.writerow(row)
+            processed_packages.update(f_processed_packages)
+
+        # Collect statistics on all packages that are not part of a framework
+        row = [row_prefix + "Others", None]
+
+        def collect_others(): return collect_package_stats(
+            packages, cwes, lambda p: p not in processed_packages)
+
+        row, other_packages = add_package_stats_to_row(
+            row, sorted_cwes, collect_others)
+
+        row[1] = ", ".join("``{0}``".format(p)
+                           for p in sorted(other_packages))
+
+        csvwriter.writerow(row)
+
+        # Collect statistics on all packages
+        row = [row_prefix + "Totals", None]
+
+        def collect_total(): return collect_package_stats(packages, cwes, lambda p: True)
+
+        row, _ = add_package_stats_to_row(
+            row, sorted_cwes, collect_total)
+
+        csvwriter.writerow(row)
+
+        rst_file.write("\n")
--- a/misc/scripts/library-coverage/settings.py
+++ b/misc/scripts/library-coverage/settings.py
@@ -0,0 +1,20 @@
+import sys
+
+generated_output_rst = "flow-model-coverage-{language}.rst"
+generated_output_csv = "flow-model-coverage-{language}.csv"
+
+# The CI job checks out the codebase to a subfolder
+data_prefix = ""
+
+index = 1
+if sys.argv[0].endswith("generate-report.py"):
+    index = 3
+
+if len(sys.argv) > index:
+    data_prefix = sys.argv[index] + "/"
+
+documentation_folder = data_prefix + \
+    "{language}/documentation/library-coverage/"
+
+repo_output_rst = documentation_folder + "flow-model-coverage.rst"
+repo_output_csv = documentation_folder + "flow-model-coverage.csv"