Merge pull request #5832 from tamasvajk/feature/csv-coverage-report

Java: github action for CSV coverage report
This commit is contained in:
Tamás Vajk
2021-05-25 14:51:19 +02:00
committed by GitHub
8 changed files with 568 additions and 0 deletions

77
.github/workflows/csv-coverage.yml vendored Normal file
View File

@@ -0,0 +1,77 @@
name: Build/check CSV flow coverage report

on:
  # Manual run; the optional input pins the models checkout to a specific SHA.
  workflow_dispatch:
    inputs:
      qlModelShaOverride:
        description: 'github/codeql repo SHA used for looking up the CSV models'
        required: false
  push:
    branches:
      - main
      - 'rc/**'
  pull_request:
    paths:
      - '.github/workflows/csv-coverage.yml'
      - '*/ql/src/**/*.ql'
      - '*/ql/src/**/*.qll'
      - 'misc/scripts/library-coverage/*.py'
      # input data files
      - '*/documentation/library-coverage/cwe-sink.csv'
      - '*/documentation/library-coverage/frameworks.csv'
      # coverage report files
      - '*/documentation/library-coverage/flow-model-coverage.csv'
      - '*/documentation/library-coverage/flow-model-coverage.rst'

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # `script` holds the report scripts and is passed to generate-report.py
      # as the location of the input data files.
      - name: Clone self (github/codeql)
        uses: actions/checkout@v2
        with:
          path: script
      # `codeqlModels` holds the QL models that are queried; exactly one of
      # the next two checkouts runs, depending on whether a SHA was supplied.
      - name: Clone self (github/codeql) at a given SHA for analysis
        if: github.event.inputs.qlModelShaOverride != ''
        uses: actions/checkout@v2
        with:
          path: codeqlModels
          # Unlike `if:`, a `with:` value is a plain string unless it is
          # wrapped in an expression, so the input has to be expanded here.
          ref: ${{ github.event.inputs.qlModelShaOverride }}
      - name: Clone self (github/codeql) for analysis
        if: github.event.inputs.qlModelShaOverride == ''
        uses: actions/checkout@v2
        with:
          path: codeqlModels
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: 3.8
      - name: Download CodeQL CLI
        uses: dsaltares/fetch-gh-release-asset@aa37ae5c44d3c9820bc12fe675e8670ecd93bd1c
        with:
          repo: "github/codeql-cli-binaries"
          version: "latest"
          file: "codeql-linux64.zip"
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Unzip CodeQL CLI
        run: unzip -d codeql-cli codeql-linux64.zip
      # Arguments: execution mode, folder with the QL models, folder with the data files.
      - name: Build modeled package list
        run: |
          PATH="$PATH:codeql-cli/codeql" python script/misc/scripts/library-coverage/generate-report.py ci codeqlModels script
      - name: Upload CSV package list
        uses: actions/upload-artifact@v2
        with:
          name: csv-flow-model-coverage
          path: flow-model-coverage-*.csv
      - name: Upload RST package list
        uses: actions/upload-artifact@v2
        with:
          name: rst-flow-model-coverage
          path: flow-model-coverage-*.rst
      # Only pull requests are checked against the committed report files.
      - name: Check coverage files
        if: github.event.pull_request
        run: |
          python script/misc/scripts/library-coverage/compare-files.py codeqlModels

View File

@@ -0,0 +1,8 @@
CWE,Sink identifier,Label
CWE089,sql,SQL injection
CWE022,create-file,Path injection
CWE036,url-open-stream,Path traversal
CWE094,bean-validation,Code injection
CWE319,open-url,Cleartext transmission
CWE079,xss,Cross-site scripting
CWE090,ldap,LDAP injection
1 CWE Sink identifier Label
2 CWE‑089 sql SQL injection
3 CWE‑022 create-file Path injection
4 CWE‑036 url-open-stream Path traversal
5 CWE‑094 bean-validation Code injection
6 CWE‑319 open-url Cleartext transmission
7 CWE‑079 xss Cross-site scripting
8 CWE‑090 ldap LDAP injection

View File

@@ -0,0 +1,42 @@
package,sink,source,summary,sink:bean-validation,sink:create-file,sink:header-splitting,sink:ldap,sink:open-url,sink:set-hostname-verifier,sink:url-open-stream,sink:xpath,sink:xss,source:remote,summary:taint,summary:value
android.util,,16,,,,,,,,,,,16,,
android.webkit,3,2,,,,,,,,,,3,2,,
com.esotericsoftware.kryo.io,,,1,,,,,,,,,,,1,
com.esotericsoftware.kryo5.io,,,1,,,,,,,,,,,1,
com.fasterxml.jackson.databind,,,2,,,,,,,,,,,2,
com.google.common.base,,,28,,,,,,,,,,,22,6
com.google.common.io,6,,69,,,,,,,6,,,,68,1
com.unboundid.ldap.sdk,17,,,,,,17,,,,,,,,
java.beans,,,1,,,,,,,,,,,1,
java.io,3,,20,,3,,,,,,,,,20,
java.lang,,,1,,,,,,,,,,,1,
java.net,2,3,4,,,,,2,,,,,3,4,
java.nio,10,,2,,10,,,,,,,,,2,
java.util,,,13,,,,,,,,,,,13,
javax.naming.directory,1,,,,,,1,,,,,,,,
javax.net.ssl,2,,,,,,,,2,,,,,,
javax.servlet,4,21,2,,,3,,,,,,1,21,2,
javax.validation,1,1,,1,,,,,,,,,1,,
javax.ws.rs.core,1,,,,,1,,,,,,,,,
javax.xml.transform.sax,,,4,,,,,,,,,,,4,
javax.xml.transform.stream,,,2,,,,,,,,,,,2,
javax.xml.xpath,3,,,,,,,,,,3,,,,
org.apache.commons.codec,,,2,,,,,,,,,,,2,
org.apache.commons.io,,,22,,,,,,,,,,,22,
org.apache.commons.lang3,,,313,,,,,,,,,,,299,14
org.apache.commons.text,,,203,,,,,,,,,,,203,
org.apache.directory.ldap.client.api,1,,,,,,1,,,,,,,,
org.apache.hc.core5.function,,,1,,,,,,,,,,,1,
org.apache.hc.core5.http,1,2,39,,,,,,,,,1,2,39,
org.apache.hc.core5.net,,,2,,,,,,,,,,,2,
org.apache.hc.core5.util,,,22,,,,,,,,,,,18,4
org.apache.http,2,3,66,,,,,,,,,2,3,59,7
org.dom4j,20,,,,,,,,,,20,,,,
org.springframework.ldap.core,14,,,,,,14,,,,,,,,
org.springframework.security.web.savedrequest,,6,,,,,,,,,,,6,,
org.springframework.web.client,,3,,,,,,,,,,,3,,
org.springframework.web.context.request,,8,,,,,,,,,,,8,,
org.springframework.web.multipart,,12,,,,,,,,,,,12,,
org.xml.sax,,,1,,,,,,,,,,,1,
org.xmlpull.v1,,3,,,,,,,,,,,3,,
play.mvc,,4,,,,,,,,,,,4,,
1 package sink source summary sink:bean-validation sink:create-file sink:header-splitting sink:ldap sink:open-url sink:set-hostname-verifier sink:url-open-stream sink:xpath sink:xss source:remote summary:taint summary:value
2 android.util 16 16
3 android.webkit 3 2 3 2
4 com.esotericsoftware.kryo.io 1 1
5 com.esotericsoftware.kryo5.io 1 1
6 com.fasterxml.jackson.databind 2 2
7 com.google.common.base 28 22 6
8 com.google.common.io 6 69 6 68 1
9 com.unboundid.ldap.sdk 17 17
10 java.beans 1 1
11 java.io 3 20 3 20
12 java.lang 1 1
13 java.net 2 3 4 2 3 4
14 java.nio 10 2 10 2
15 java.util 13 13
16 javax.naming.directory 1 1
17 javax.net.ssl 2 2
18 javax.servlet 4 21 2 3 1 21 2
19 javax.validation 1 1 1 1
20 javax.ws.rs.core 1 1
21 javax.xml.transform.sax 4 4
22 javax.xml.transform.stream 2 2
23 javax.xml.xpath 3 3
24 org.apache.commons.codec 2 2
25 org.apache.commons.io 22 22
26 org.apache.commons.lang3 313 299 14
27 org.apache.commons.text 203 203
28 org.apache.directory.ldap.client.api 1 1
29 org.apache.hc.core5.function 1 1
30 org.apache.hc.core5.http 1 2 39 1 2 39
31 org.apache.hc.core5.net 2 2
32 org.apache.hc.core5.util 22 18 4
33 org.apache.http 2 3 66 2 3 59 7
34 org.dom4j 20 20
35 org.springframework.ldap.core 14 14
36 org.springframework.security.web.savedrequest 6 6
37 org.springframework.web.client 3 3
38 org.springframework.web.context.request 8 8
39 org.springframework.web.multipart 12 12
40 org.xml.sax 1 1
41 org.xmlpull.v1 3 3
42 play.mvc 4 4

View File

@@ -0,0 +1,19 @@
Java framework & library support
================================
.. csv-table::
:header-rows: 1
:class: fullWidthTable
:widths: auto
Framework / library,Package,Remote flow sources,Taint & value steps,Sinks (total),`CWE022` :sub:`Path injection`,`CWE036` :sub:`Path traversal`,`CWE079` :sub:`Cross-site scripting`,`CWE089` :sub:`SQL injection`,`CWE090` :sub:`LDAP injection`,`CWE094` :sub:`Code injection`,`CWE319` :sub:`Cleartext transmission`
Android,``android.*``,18,,3,,,3,,,,
Apache,``org.apache.*``,5,648,4,,,3,,1,,
`Apache Commons IO <https://commons.apache.org/proper/commons-io/>`_,``org.apache.commons.io``,,22,,,,,,,,
Google,``com.google.common.*``,,97,6,,6,,,,,
Java Standard Library,``java.*``,3,41,15,13,,,,,,2
Java extensions,``javax.*``,22,8,12,,,1,,1,1,
`Spring <https://spring.io/>`_,``org.springframework.*``,29,,14,,,,,14,,
Others,"``com.esotericsoftware.kryo.io``, ``com.esotericsoftware.kryo5.io``, ``com.fasterxml.jackson.databind``, ``com.unboundid.ldap.sdk``, ``org.dom4j``, ``org.xml.sax``, ``org.xmlpull.v1``, ``play.mvc``",7,5,37,,,,,17,,
Totals,,84,821,91,13,6,7,,33,1,2

View File

@@ -0,0 +1,8 @@
Framework name,URL,Package prefix
Java Standard Library,,java.*
Google,,com.google.common.*
Apache,,org.apache.*
Apache Commons IO,https://commons.apache.org/proper/commons-io/,org.apache.commons.io
Android,,android.*
Spring,https://spring.io/,org.springframework.*
Java extensions,,javax.*
1 Framework name URL Package prefix
2 Java Standard Library java.*
3 Google com.google.common.*
4 Apache org.apache.*
5 Apache Commons IO https://commons.apache.org/proper/commons-io/ org.apache.commons.io
6 Android android.*
7 Spring https://spring.io/ org.springframework.*
8 Java extensions javax.*

View File

@@ -0,0 +1,54 @@
import sys
import os
import settings
import difflib
"""
This script compares the generated CSV coverage files with the ones in the codebase.
"""
def check_file_exists(file):
    """Terminates the script with exit code 1 if `file` does not exist.

    The problem is reported on stderr first. Nothing happens (and None is
    returned) when the path exists.
    """
    if os.path.exists(file):
        return
    print("Expected file '" + file + "' doesn't exist.", file=sys.stderr)
    sys.exit(1)
def ignore_line_ending(ch):
    """Returns True if `ch` is a space, a carriage return or a line feed.

    This is the character-junk callback that `compare_files` passes to
    `difflib.ndiff`.
    """
    junk_characters = " \r\n"
    return difflib.IS_CHARACTER_JUNK(ch, ws=junk_characters)
def compare_files(file1, file2):
    """Compares two text files line by line and aborts if they differ.

    Every added ('+') or removed ('-') line reported by `difflib.ndiff` is
    printed to stderr. If there is at least one such line, an error message
    naming `file1` is printed and the script exits with code 1. When the files
    match, the function returns normally.
    """
    # Read both files up front inside a `with` block so the handles are always
    # closed, even if reading the second file fails.
    with open(file1) as first, open(file2) as second:
        first_lines = first.readlines()
        second_lines = second.readlines()

    has_differences = False
    for line in difflib.ndiff(first_lines, second_lines, None, ignore_line_ending):
        # ndiff also yields unchanged ('  ') and hint ('? ') lines; only real
        # additions and removals count as a mismatch.
        if line.startswith(("+", "-")):
            print(line, end="", file=sys.stderr)
            has_differences = True

    if has_differences:
        print("Error: The generated file doesn't match the one in the codebase. Please check and fix file '" +
              file1 + "'.", file=sys.stderr)
        sys.exit(1)
# Languages whose committed coverage files are checked against the generated ones.
languages = ['java']

for lang in languages:
    # Expand the per-language file name templates defined in settings.
    repo_output_rst = settings.repo_output_rst.format(language=lang)
    repo_output_csv = settings.repo_output_csv.format(language=lang)
    generated_output_rst = settings.generated_output_rst.format(language=lang)
    generated_output_csv = settings.generated_output_csv.format(language=lang)

    # All four files have to be present before anything is compared.
    for required_file in (repo_output_rst, repo_output_csv,
                          generated_output_rst, generated_output_csv):
        check_file_exists(required_file)

    # Each comparison exits the script on the first mismatching pair.
    for repo_file, generated_file in ((repo_output_rst, generated_output_rst),
                                      (repo_output_csv, generated_output_csv)):
        compare_files(repo_file, generated_file)

    print("The generated files for '" + lang +
          "' match the ones in the codebase.")

View File

@@ -0,0 +1,340 @@
import subprocess
import csv
import sys
import os
import shutil
import settings
"""
This script runs the CSV coverage report QL query, and transforms it to a more readable format.
There are two main outputs: (i) a CSV file containing the coverage data, and (ii) an RST page containing the coverage
data.
"""
def subprocess_run(cmd):
    """Runs `cmd` through subprocess.run and returns the completed process.

    Output is captured as text and the child receives a copy of the current
    environment. A non-zero exit code raises subprocess.CalledProcessError.
    """
    environment = os.environ.copy()
    completed = subprocess.run(
        cmd,
        env=environment,
        check=True,
        capture_output=True,
        text=True,
    )
    return completed
def create_empty_database(lang, extension, database):
    """Creates an empty database for the given language.

    `lang` is passed to `codeql database init --language=`, `extension` is the
    suffix of the single empty source file that is added, and `database` is the
    folder in which the database is created.

    Raises subprocess.CalledProcessError if a `codeql` invocation fails, and
    OSError if the placeholder source file cannot be created.
    """
    subprocess_run(["codeql", "database", "init", "--language=" + lang,
                    "--source-root=/tmp/empty", "--allow-missing-source-root", database])

    # Create <database>/src/tmp/empty/empty<extension> with the standard
    # library instead of spawning `mkdir -p` and `touch`.
    source_folder = os.path.join(database, "src", "tmp", "empty")
    os.makedirs(source_folder, exist_ok=True)
    empty_source = os.path.join(source_folder, "empty" + extension)
    with open(empty_source, "a"):
        # Like `touch`: keep existing content, refresh the timestamps.
        os.utime(empty_source, None)

    subprocess_run(["codeql", "database", "finalize",
                    database, "--no-pre-finalize"])
def run_codeql_query(query, database, output):
    """Runs a codeql query on the given database and writes the result as CSV.

    The query result is first stored in `<output>.bqrs`, then decoded into
    `output` (CSV, without a title row). The intermediate .bqrs file is removed
    once decoding has succeeded.
    """
    bqrs_file = output + ".bqrs"
    run_command = ["codeql", "query", "run", query,
                   "--database", database, "--output", bqrs_file]
    decode_command = ["codeql", "bqrs", "decode", bqrs_file,
                      "--format=csv", "--no-titles", "--output", output]

    subprocess_run(run_command)
    subprocess_run(decode_command)
    os.remove(bqrs_file)
def append_csv_number(list, value):
    """Appends `value` to the list if it is greater than 0, otherwise appends None."""
    list.append(value if value > 0 else None)
def append_csv_dict_item(list, dictionary, key):
    """Appends dictionary[key] to the list, or None if the key is not in the dictionary."""
    list.append(dictionary.get(key))
def increment_dict_item(value, dictionary, key):
    """Adds int(value) to dictionary[key], starting from 0 if the key is missing."""
    dictionary.setdefault(key, 0)
    dictionary[key] += int(value)
def collect_package_stats(packages, cwes, filter):
    """
    Sums up the coverage numbers of every package accepted by `filter`.

    `filter` is a callable taking a package name; it is used for example to (i) select the packages of a framework, or
    (ii) select the packages that were not processed before.

    Returns a tuple `(sources, steps, sinks, framework_cwes, processed_packages)`: the number of remote sources, of
    summaries and of sinks, a dictionary mapping each CWE to the number of sinks of its sink kind, and the set of
    package names that passed the filter. The result is used to generate a single row in a CSV file.
    """
    sources = 0
    steps = 0
    sinks = 0
    framework_cwes = {}
    processed_packages = set()

    for name, stats in packages.items():
        if not filter(name):
            continue

        processed_packages.add(name)
        part_counts = stats["part"]
        kind_counts = stats["kind"]

        sources += int(kind_counts.get("source:remote", 0))
        steps += int(part_counts.get("summary", 0))
        sinks += int(part_counts.get("sink", 0))

        # Attribute the sinks of this package to the CWEs they belong to.
        for cwe, cwe_info in cwes.items():
            sink_kind = "sink:" + cwe_info["sink"]
            if sink_kind in kind_counts:
                framework_cwes[cwe] = framework_cwes.get(
                    cwe, 0) + int(kind_counts[sink_kind])

    return sources, steps, sinks, framework_cwes, processed_packages
def add_package_stats_to_row(row, sorted_cwes, collect):
    """
    Appends the statistics returned by `collect` to `row` and returns `(row, processed_packages)`.

    `collect` is a callable without arguments returning the tuple produced by `collect_package_stats`, for example for
    (i) an individual framework, (ii) the leftover packages summarized in the 'Others' row, or (iii) all packages
    summarized in the 'Totals' row. The sources, steps and sinks columns come first, followed by one column per CWE in
    the order of `sorted_cwes`.
    """
    stats = collect()
    sources, steps, sinks, framework_cwes, processed_packages = stats

    for total in (sources, steps, sinks):
        append_csv_number(row, total)

    for cwe in sorted_cwes:
        append_csv_dict_item(row, framework_cwes, cwe)

    return row, processed_packages
class LanguageConfig:
    """Settings needed to generate the coverage report of one language."""

    def __init__(self, lang, capitalized_lang, ext, ql_path):
        # lang: language name handed to the CodeQL CLI and used in file names
        # capitalized_lang: display name used in the title of the RST page
        # ext: extension of the empty source file put into the empty database
        # ql_path: path of the coverage query to run
        self.lang, self.capitalized_lang = lang, capitalized_lang
        self.ext, self.ql_path = ext, ql_path
try:  # Check for `codeql` on path
    subprocess_run(["codeql", "--version"])
except Exception:
    print("Error: couldn't invoke CodeQL CLI 'codeql'. Is it on the path? Aborting.", file=sys.stderr)
    raise

# The script can be run in two modes:
# (i) dev: run on the local developer machine, and collect the coverage data. The output is generated into the expected
# folders: {language}/documentation/library-coverage/
# (ii) ci: run in a CI action. The output is generated to the root folder, and then in a subsequent step packaged as a
# build artifact.
mode = "dev"
if len(sys.argv) > 1:
    mode = sys.argv[1]

if mode not in ("dev", "ci"):
    print("Unknown execution mode: " + mode +
          ". Expected either 'dev' or 'ci'.", file=sys.stderr)
    # `exit` is only injected by the `site` module for interactive use; scripts should call sys.exit.
    sys.exit(1)

# The QL model holding the CSV info can come from directly a PR or the main branch, but optionally we can use an earlier
# SHA too, therefore it's checked out separately into a dedicated subfolder.
query_prefix = ""
if len(sys.argv) > 2:
    query_prefix = sys.argv[2] + "/"

# Languages for which we want to generate coverage reports.
configs = [
    LanguageConfig(
        "java", "Java", ".java", query_prefix + "java/ql/src/meta/frameworks/Coverage.ql")
]

# The names of input and output files. The placeholder {language} is replaced with the language name.
output_ql_csv = "output-{language}.csv"
input_framework_csv = settings.documentation_folder + "frameworks.csv"
input_cwe_sink_csv = settings.documentation_folder + "cwe-sink.csv"

# In dev mode the report is written next to the input data; in ci mode it goes to the current folder.
if mode == "dev":
    output_rst = settings.repo_output_rst
    output_csv = settings.repo_output_csv
else:
    output_rst = settings.generated_output_rst
    output_csv = settings.generated_output_csv
# For each configured language: run the coverage query on an empty database, write the denormalized per-package CSV,
# then write the RST page with one row per framework plus the 'Others' and 'Totals' rows.
for config in configs:
    lang = config.lang
    db = "empty-" + lang
    ql_output = output_ql_csv.format(language=lang)
    create_empty_database(lang, config.ext, db)
    run_codeql_query(config.ql_path, db, ql_output)
    shutil.rmtree(db)

    packages = {}
    parts = set()
    kinds = set()

    # Read the generated CSV file, and collect package statistics.
    # newline='' lets the csv module do the line-ending handling itself, as its documentation recommends.
    with open(ql_output, newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            # row: "android.util",1,"remote","source",16
            package = row[0]
            if package not in packages:
                packages[package] = {
                    "count": row[1],
                    # part: "summary", "sink", or "source"
                    "part": {},
                    # kind: "source:remote", "sink:create-file", ...
                    "kind": {}
                }
            part = row[3]
            parts.add(part)
            increment_dict_item(row[4], packages[package]["part"], part)
            kind = part + ":" + row[2]
            kinds.add(kind)
            increment_dict_item(row[4], packages[package]["kind"], kind)

    os.remove(ql_output)

    # Sorted so that the column order of the output is stable.
    parts = sorted(parts)
    kinds = sorted(kinds)

    # Write the denormalized package statistics to a CSV file.
    with open(output_csv.format(language=lang), 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)

        headers = ["package"]
        headers.extend(parts)
        headers.extend(kinds)

        csvwriter.writerow(headers)

        for package in sorted(packages):
            row = [package]
            for part in parts:
                append_csv_dict_item(row, packages[package]["part"], part)
            for kind in kinds:
                append_csv_dict_item(row, packages[package]["kind"], kind)
            csvwriter.writerow(row)

    # Read the additional framework data, such as URL, friendly name
    frameworks = {}

    with open(input_framework_csv.format(language=lang), newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row
        for row in reader:
            # row: Hibernate,https://hibernate.org/,org.hibernate
            framework = row[0]
            # The first row wins if a framework name is listed more than once.
            if framework not in frameworks:
                frameworks[framework] = {
                    "package": row[2],
                    "url": row[1]
                }

    # Read the additional CWE data
    cwes = {}

    with open(input_cwe_sink_csv.format(language=lang), newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row
        for row in reader:
            # row: CWE-89,sql,SQL injection
            cwe = row[0]
            if cwe not in cwes:
                cwes[cwe] = {
                    "sink": row[1],
                    "label": row[2]
                }

    sorted_cwes = sorted(cwes)

    with open(output_rst.format(language=lang), 'w', newline='') as rst_file:
        # The underline is derived from the title, so it stays long enough for any language name.
        title = config.capitalized_lang + " framework & library support"
        rst_file.write(title + "\n")
        rst_file.write("=" * len(title) + "\n\n")
        # The leading whitespace of the following literals and of row_prefix is part of the generated RST.
        rst_file.write(".. csv-table::\n")
        rst_file.write(" :header-rows: 1\n")
        rst_file.write(" :class: fullWidthTable\n")
        rst_file.write(" :widths: auto\n\n")

        row_prefix = " "

        # Write CSV file with package statistics and framework data to be used in RST file.
        csvwriter = csv.writer(rst_file)

        # Write CSV header.
        headers = [row_prefix + "Framework / library",
                   "Package",
                   "Remote flow sources",
                   "Taint & value steps",
                   "Sinks (total)"]
        for cwe in sorted_cwes:
            headers.append(
                "`{0}` :sub:`{1}`".format(cwe, cwes[cwe]["label"]))
        csvwriter.writerow(headers)

        processed_packages = set()

        all_package_patterns = {
            frameworks[fr]["package"] for fr in frameworks}

        # Write a row for each framework.
        for framework in sorted(frameworks):
            row = []

            # Add the framework name to the row
            if not frameworks[framework]["url"]:
                row.append(row_prefix + framework)
            else:
                row.append(
                    row_prefix + "`" + framework + " <" + frameworks[framework]["url"] + ">`_")

            # Add the package name to the row
            row.append("``" + frameworks[framework]["package"] + "``")

            current_package_pattern = frameworks[framework]["package"]

            # Collect statistics on the current framework
            # current_package_pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
            # Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
            # the latter will not include the statistics for the former.
            def package_match(package_name, pattern):
                """Returns True if the package matches the exact name or the '*'-terminated prefix pattern."""
                if pattern.endswith("*"):
                    return package_name.startswith(pattern[:-1])
                return pattern == package_name

            def collect_framework():
                """Collects the packages matching the current pattern and no longer, more specific pattern."""
                return collect_package_stats(
                    packages, cwes,
                    lambda p: package_match(p, current_package_pattern) and all(
                        len(current_package_pattern) >= len(pattern) or not package_match(p, pattern)
                        for pattern in all_package_patterns))

            row, f_processed_packages = add_package_stats_to_row(
                row, sorted_cwes, collect_framework)

            csvwriter.writerow(row)
            processed_packages.update(f_processed_packages)

        # Collect statistics on all packages that are not part of a framework
        row = [row_prefix + "Others", None]

        def collect_others():
            return collect_package_stats(
                packages, cwes, lambda p: p not in processed_packages)

        row, other_packages = add_package_stats_to_row(
            row, sorted_cwes, collect_others)

        # The package column of the 'Others' row lists every leftover package.
        row[1] = ", ".join("``{0}``".format(p)
                           for p in sorted(other_packages))

        csvwriter.writerow(row)

        # Collect statistics on all packages
        row = [row_prefix + "Totals", None]

        def collect_total():
            return collect_package_stats(packages, cwes, lambda p: True)

        row, _ = add_package_stats_to_row(
            row, sorted_cwes, collect_total)

        csvwriter.writerow(row)

        rst_file.write("\n")

View File

@@ -0,0 +1,20 @@
import sys
# File names of the report generated in CI mode; {language} is filled in by the caller.
generated_output_rst = "flow-model-coverage-{language}.rst"
generated_output_csv = "flow-model-coverage-{language}.csv"

# The CI job checks out the codebase to a subfolder. The folder name is expected as the third command line argument
# of generate-report.py, and as the first argument of any other script importing this module.
index = 3 if sys.argv[0].endswith("generate-report.py") else 1
data_prefix = sys.argv[index] + "/" if len(sys.argv) > index else ""

# Location of the input data files and of the report files kept in the repository.
documentation_folder = data_prefix + "{language}/documentation/library-coverage/"
repo_output_rst = documentation_folder + "flow-model-coverage.rst"
repo_output_csv = documentation_folder + "flow-model-coverage.csv"