Rework CSV report generator and change timeseries report to use frameworks.csv

This commit is contained in:
Tamas Vajk
2021-06-08 14:51:49 +02:00
parent c6cb7c6eed
commit ba9c2e0702
5 changed files with 323 additions and 160 deletions

View File

@@ -0,0 +1,72 @@
import csv
import sys
import packages
class Framework:
    """
    A framework is the unit by which the RST and the timeseries reports aggregate statistics.
    Each instance corresponds to one row of the frameworks.csv file.
    """

    def __init__(self, name, url, package_pattern):
        # Store the three columns of the row as they were passed in.
        self.name, self.url, self.package_pattern = name, url, package_pattern
class FrameworkCollection:
    """
    A list of frameworks, sorted by name, read from a frameworks.csv file.

    The first line of the file is treated as a header and skipped. Every other row has the form
    `Hibernate,https://hibernate.org/,org.hibernate` (name, URL, package pattern).
    """

    def __init__(self, path):
        """
        Reads the frameworks from the CSV file at `path`.

        An empty file yields an empty collection and blank lines are ignored. A non-blank row with fewer
        than three columns still raises `IndexError`, so malformed data is not silently dropped.
        """
        self.frameworks: list[Framework] = []
        self.package_patterns = set()

        # The csv module expects its input file to be opened with newline=''.
        with open(path, newline='') as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(reader, None)
            for row in reader:
                # csv.reader yields an empty list for a blank line.
                if not row:
                    continue
                # row: Hibernate,https://hibernate.org/,org.hibernate
                self.__add(Framework(row[0], row[1], row[2]))

        self.__sort()

    def __add(self, framework: "Framework"):
        """
        Adds `framework` to the collection unless its package pattern is already in use, in which case
        the framework is dropped and a message is printed to stderr.
        """
        if framework.package_pattern in self.package_patterns:
            print("Package pattern already exists: " +
                  framework.package_pattern, file=sys.stderr)
            return

        self.package_patterns.add(framework.package_pattern)
        self.frameworks.append(framework)

    def __sort(self):
        """Sorts the frameworks in place by their name."""
        self.frameworks.sort(key=lambda f: f.name)

    def get(self, framework_name):
        """Returns the first framework named `framework_name`, or None if there is no such framework."""
        return next((f for f in self.frameworks if f.name == framework_name), None)

    def get_patterns(self):
        """Returns the set of package patterns of all frameworks in the collection."""
        return self.package_patterns

    def get_frameworks(self):
        """Returns the list of frameworks."""
        return self.frameworks

    def __package_match(self, package: "packages.Package", pattern):
        """
        Returns whether the name of `package` matches `pattern`. A pattern ending with "*" is a prefix
        pattern, any other pattern has to be equal to the package name.
        """
        if pattern.endswith("*"):
            return package.name.startswith(pattern[:-1])
        return pattern == package.name

    def get_package_filter(self, framework: "Framework"):
        """
        Returns a filter function that holds for packages that match the given framework.
        The pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
        Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
        the latter will not include the statistics for the former.
        """
        def matches(p):
            own_pattern = framework.package_pattern
            if not self.__package_match(p, own_pattern):
                return False
            # The package belongs to this framework only if no longer pattern also matches it.
            # NOTE(review): patterns are ranked by raw length, so an exact pattern and a prefix pattern of
            # the same length (for example 'a.b' and 'a.*') both accept the package 'a.b'.
            return all(
                len(own_pattern) >= len(pattern) or not self.__package_match(p, pattern)
                for pattern in self.get_patterns())

        return matches

View File

@@ -4,6 +4,8 @@ import os
import shutil
import settings
import utils
import packages as pack
import frameworks as fr
"""
This script runs the CSV coverage report QL query, and transforms it to a more readable format.
@@ -28,14 +30,7 @@ def append_csv_dict_item(list, dictionary, key):
list.append(None)
def increment_dict_item(value, dictionary, key):
    """Adds `int(value)` to `dictionary[key]`, starting from 0 when the key is not present yet."""
    dictionary.setdefault(key, 0)
    dictionary[key] += int(value)
def collect_package_stats(packages, cwes, filter):
def collect_package_stats(packages: pack.PackageCollection, cwes, filter):
"""
Collects coverage statistics for packages matching the given filter. `filter` is a `lambda` that for example (i) matches
packages to frameworks, or (2) matches packages that were previously not processed.
@@ -48,20 +43,21 @@ def collect_package_stats(packages, cwes, filter):
framework_cwes = {}
processed_packages = set()
for package in packages:
for package in packages.get_packages():
package: pack.Package = package
if filter(package):
processed_packages.add(package)
sources += int(packages[package]["kind"].get("source:remote", 0))
steps += int(packages[package]["part"].get("summary", 0))
sinks += int(packages[package]["part"].get("sink", 0))
sources += package.get_kind_count("source:remote")
steps += package.get_part_count("summary")
sinks += package.get_part_count("sink")
for cwe in cwes:
sink = "sink:" + cwes[cwe]["sink"]
if sink in packages[package]["kind"]:
count = package.get_kind_count(sink)
if count > 0:
if cwe not in framework_cwes:
framework_cwes[cwe] = 0
framework_cwes[cwe] += int(
packages[package]["kind"][sink])
framework_cwes[cwe] += count
return sources, steps, sinks, framework_cwes, processed_packages
@@ -137,37 +133,12 @@ for config in configs:
utils.run_codeql_query(config.ql_path, db, ql_output)
shutil.rmtree(db)
packages = {}
parts = set()
kinds = set()
# Read the generated CSV file, and collect package statistics.
with open(ql_output) as csvfile:
reader = csv.reader(csvfile)
for row in reader:
# row: "android.util",1,"remote","source",16
package = row[0]
if package not in packages:
packages[package] = {
"count": row[1],
# part: "summary", "sink", or "source"
"part": {},
# kind: "source:remote", "sink:create-file", ...
"kind": {}
}
part = row[3]
parts.add(part)
increment_dict_item(row[4], packages[package]["part"], part)
kind = part + ":" + row[2]
kinds.add(kind)
increment_dict_item(row[4], packages[package]["kind"], kind)
packages = pack.PackageCollection(ql_output)
os.remove(ql_output)
parts = sorted(parts)
kinds = sorted(kinds)
parts = packages.get_parts()
kinds = packages.get_kinds()
# Write the denormalized package statistics to a CSV file.
with open(output_csv.format(language=lang), 'w', newline='') as csvfile:
@@ -179,44 +150,21 @@ for config in configs:
csvwriter.writerow(headers)
for package in sorted(packages):
row = [package]
for package in packages.get_packages():
package: pack.Package = package
row = [package.name]
for part in parts:
append_csv_dict_item(row, packages[package]["part"], part)
append_csv_number(row, package.get_part_count(part))
for kind in kinds:
append_csv_dict_item(row, packages[package]["kind"], kind)
append_csv_number(row, package.get_kind_count(kind))
csvwriter.writerow(row)
# Read the additional framework data, such as URL, friendly name
frameworks = {}
with open(input_framework_csv.format(language=lang)) as csvfile:
reader = csv.reader(csvfile)
next(reader)
for row in reader:
# row: Hibernate,https://hibernate.org/,org.hibernate
framwork = row[0]
if framwork not in frameworks:
frameworks[framwork] = {
"package": row[2],
"url": row[1]
}
frameworks = fr.FrameworkCollection(
input_framework_csv.format(language=lang))
# Read the additional CWE data
cwes = {}
with open(input_cwe_sink_csv.format(language=lang)) as csvfile:
reader = csv.reader(csvfile)
next(reader)
for row in reader:
# row: CWE-89,sql,SQL injection
cwe = row[0]
if cwe not in cwes:
cwes[cwe] = {
"sink": row[1],
"label": row[2]
}
cwes = utils.read_cwes(input_cwe_sink_csv.format(language=lang))
sorted_cwes = sorted(cwes)
with open(output_rst.format(language=lang), 'w', newline='') as rst_file:
@@ -246,34 +194,24 @@ for config in configs:
processed_packages = set()
all_package_patterns = set(
(frameworks[fr]["package"] for fr in frameworks))
# Write a row for each framework.
for framework in sorted(frameworks):
for framework in frameworks.get_frameworks():
framework: fr.Framework = framework
row = []
# Add the framework name to the row
if not frameworks[framework]["url"]:
row.append(row_prefix + framework)
if not framework.url:
row.append(row_prefix + framework.name)
else:
row.append(
row_prefix + "`" + framework + " <" + frameworks[framework]["url"] + ">`_")
row_prefix + "`" + framework.name + " <" + framework.url + ">`_")
# Add the package name to the row
row.append("``" + frameworks[framework]["package"] + "``")
current_package_pattern = frameworks[framework]["package"]
row.append("``" + framework.package_pattern + "``")
# Collect statistics on the current framework
# current_package_pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
# Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
# the latter will not include the statistics for the former.
def package_match(package_name, pattern): return (pattern.endswith(
"*") and package_name.startswith(pattern[:-1])) or (not pattern.endswith("*") and pattern == package_name)
def collect_framework(): return collect_package_stats(
packages, cwes, lambda p: package_match(p, current_package_pattern) and all(len(current_package_pattern) >= len(pattern) or not package_match(p, pattern) for pattern in all_package_patterns))
packages, cwes, frameworks.get_package_filter(framework))
row, f_processed_packages = add_package_stats_to_row(
row, sorted_cwes, collect_framework)
@@ -290,8 +228,8 @@ for config in configs:
row, other_packages = add_package_stats_to_row(
row, sorted_cwes, collect_others)
row[1] = ", ".join("``{0}``".format(p)
for p in sorted(other_packages))
row[1] = ", ".join("``{0}``".format(p.name)
for p in sorted(other_packages, key=lambda x: x.name))
csvwriter.writerow(row)

View File

@@ -6,42 +6,43 @@ import shutil
from datetime import date
import datetime
import utils
import settings
import packages as pack
import frameworks as fr
"""
Gets the sink/source/summary statistics for different days.
Gets the sink/source/summary statistics for different days.
"""
# the distance between commits to include in the output
day_distance = 1
def get_str_output(arr):
r = subprocess.check_output(arr)
return r.decode("utf-8").strip("\n'")
class Git:
    """
    Helpers that query the git repository in the current working directory through the `git` command.

    All helpers are static, they can be called on the class (`Git.get_date(sha)`) or on an instance.
    """

    @staticmethod
    def get_output(arr):
        """
        Runs the command given by the argument list `arr` and returns its standard output as text, with
        leading and trailing newline and single-quote characters removed.

        Raises `subprocess.CalledProcessError` if the command exits with a non-zero status.
        """
        r = subprocess.check_output(arr, text=True, env=os.environ.copy())
        # Arguments are passed without a shell, so the quotes of formats like --pretty='%cd' end up in the output.
        return r.strip("\n'")

    @staticmethod
    def get_date(sha):
        """Returns the commit date of `sha` as a `datetime.date`."""
        d = Git.get_output(
            ["git", "show", "--no-patch", "--no-notes", "--pretty='%cd'", "--date=short", sha])
        return date.fromisoformat(d)

    @staticmethod
    def get_parent(sha):
        """Returns a `(sha, date)` tuple describing the first parent of the commit `sha`."""
        parent_sha = Git.get_output(
            ["git", "rev-parse", sha + "^"])
        parent_date = Git.get_date(parent_sha)
        return (parent_sha, parent_date)

    @staticmethod
    def get_previous_sha(sha, date):
        """
        Follows the first-parent chain starting at `sha` and returns the `(sha, date)` tuple of the first
        ancestor whose commit date is at least `day_distance` days before `date`.

        The error raised by `get_parent` propagates when the chain ends before such a commit is found.
        """
        parent_sha, parent_date = Git.get_parent(sha)
        while parent_date > date + datetime.timedelta(days=-1 * day_distance):
            parent_sha, parent_date = Git.get_parent(parent_sha)
        return (parent_sha, parent_date)
def get_date(sha):
d = get_str_output(
["git", "show", "--no-patch", "--no-notes", "--pretty='%cd'", "--date=short", sha])
return date.fromisoformat(d)
def get_parent(sha, date):
parent_sha = get_str_output(
["git", "rev-parse", sha + "^"])
parent_date = get_date(parent_sha)
return (parent_sha, parent_date)
def get_previous_sha(sha, date):
parent_sha, parent_date = get_parent(sha, date)
while parent_date > date + datetime.timedelta(days=-1 * day_distance):
parent_sha, parent_date = get_parent(parent_sha, parent_date)
return (parent_sha, parent_date)
def get_stats(lang, query):
def get_packages(lang, query):
try:
db = "empty_" + lang
ql_output = "output-" + lang + ".csv"
@@ -50,41 +51,14 @@ def get_stats(lang, query):
utils.create_empty_database(lang, ".java", db)
utils.run_codeql_query(query, db, ql_output)
sources = 0
sinks = 0
summaries = 0
packages = {}
with open(ql_output) as csvfile:
reader = csv.reader(csvfile)
for row in reader:
# row: "android.util",1,"remote","source",16
package = row[0]
if package not in packages:
packages[package] = {
"sources": 0,
"sinks": 0,
"summaries": 0
}
if row[3] == "source":
sources += int(row[4])
packages[package]["sources"] += int(row[4])
if row[3] == "sink":
sinks += int(row[4])
packages[package]["sinks"] += int(row[4])
if row[3] == "summary":
summaries += int(row[4])
packages[package]["summaries"] += int(row[4])
os.remove(ql_output)
return (sources, sinks, summaries, packages)
return pack.PackageCollection(ql_output)
except:
print("Unexpected error:", sys.exc_info()[0])
raise Exception()
finally:
if os.path.isfile(ql_output):
os.remove(ql_output)
if os.path.isdir(db):
shutil.rmtree(db)
@@ -108,28 +82,79 @@ for config in configs:
csvwriter_total.writerow(
["SHA", "Date", "Sources", "Sinks", "Summaries"])
csvwriter_packages.writerow(
["SHA", "Date", "Package", "Sources", "Sinks", "Summaries"])
["SHA", "Date", "Framework", "Package", "Sources", "Sinks", "Summaries"])
os.chdir(working_dir)
utils.subprocess_run(["git", "checkout", "main"])
current_sha = get_str_output(["git", "rev-parse", "HEAD"])
current_date = get_date(current_sha)
current_sha = Git.get_output(["git", "rev-parse", "HEAD"])
current_date = Git.get_date(current_sha)
# Read the additional framework data, such as URL, friendly name from the latest commit
input_framework_csv = settings.documentation_folder_no_prefix + "frameworks.csv"
frameworks = fr.FrameworkCollection(
input_framework_csv.format(language=config.lang))
while True:
print("Getting stats for " + current_sha)
utils.subprocess_run(["git", "checkout", current_sha])
try:
stats = get_stats(config.lang, config.ql_path)
packages = get_packages(config.lang, config.ql_path)
csvwriter_total.writerow(
[current_sha, current_date, stats[0], stats[1], stats[2]])
csvwriter_total.writerow([
current_sha,
current_date,
packages.get_part_count("source"),
packages.get_part_count("sink"),
packages.get_part_count("summary")])
for package in stats[3]:
csvwriter_packages.writerow(
[current_sha, current_date, package, stats[3][package]["sources"], stats[3][package]["sinks"], stats[3][package]["summaries"]])
matched_packages = set()
for framework in frameworks.get_frameworks():
framework: fr.Framework = framework
row = [current_sha, current_date,
framework.name, framework.package_pattern]
sources = 0
sinks = 0
summaries = 0
for package in packages.get_packages():
if frameworks.get_package_filter(framework)(package):
sources += package.get_part_count("source")
sinks += package.get_part_count("sink")
summaries += package.get_part_count("summary")
matched_packages.add(package.name)
row.append(sources)
row.append(sinks)
row.append(summaries)
csvwriter_packages.writerow(row)
row = [current_sha, current_date, "Others"]
sources = 0
sinks = 0
summaries = 0
other_packages = set()
for package in packages.get_packages():
if not package.name in matched_packages:
sources += package.get_part_count("source")
sinks += package.get_part_count("sink")
summaries += package.get_part_count("summary")
other_packages.add(package.name)
row.append(", ".join(sorted(other_packages)))
row.append(sources)
row.append(sinks)
row.append(summaries)
csvwriter_packages.writerow(row)
print("Collected stats for " + current_sha +
" at " + current_date.isoformat())
@@ -138,7 +163,7 @@ for config in configs:
current_sha + ". Stopping iteration.")
break
current_sha, current_date = get_previous_sha(
current_sha, current_date = Git.get_previous_sha(
current_sha, current_date)
utils.subprocess_run(["git", "checkout", "main"])

View File

@@ -0,0 +1,110 @@
import csv
class PackagePart:
    """
    One row of the QL query output, that is a part of a package together with its count, for example:
      "android.util",1,"remote","source",16
    """

    def __init__(self, package, kind, part, count):
        self.package = package

        # part is one of "summary", "sink", or "source"
        self.part = part

        # kind is stored prefixed with the part: "source:remote", "sink:create-file", ...
        self.kind = ":".join((part, kind))

        self.count = int(count)
class Package:
    """
    A whole package, holding every part that the QL query returned for it.
    """

    def __init__(self, name, package_count):
        self.name = name
        self.package_count = int(package_count)
        self.parts: list[PackagePart] = []

    def add_part(self, part: "PackagePart"):
        """Appends `part` to the parts of this package."""
        self.parts.append(part)

    def get_part_count(self, p):
        """Returns the sum of the counts of the parts whose `part` equals `p`."""
        return sum(item.count for item in self.parts if item.part == p)

    def get_kind_count(self, k):
        """Returns the sum of the counts of the parts whose `kind` equals `k`."""
        return sum(item.count for item in self.parts if item.kind == k)
class PackageCollection:
    """
    A list of packages, sorted by name. Packages are returned by the QL query in the form:
      "android.util",1,"remote","source",16
    And then the denormalized rows are aggregated by packages.
    """

    def __init__(self, ql_output_path):
        """
        Reads the QL query output from the CSV file at `ql_output_path`. Every row is expected to have five
        columns (package name, package count, kind, part, count); blank lines are ignored.
        """
        self.packages: list[Package] = []
        self.package_names = set()
        # Maps the package name to its Package, so a row does not need a scan over all packages.
        self.__packages_by_name = {}

        # Read the generated CSV file, and collect package statistics.
        # The csv module expects its input file to be opened with newline=''.
        with open(ql_output_path, newline='') as csvfile:
            for row in csv.reader(csvfile):
                # csv.reader yields an empty list for a blank line.
                if not row:
                    continue
                # row: "android.util",1,"remote","source",16
                package = self.__get_or_create_package(row[0], row[1])
                package.add_part(PackagePart(package, row[2], row[3], row[4]))

        self.__sort()

    def __sort(self):
        """Sorts the packages in place by their name."""
        self.packages.sort(key=lambda f: f.name)

    def get_packages(self):
        """Returns the list of packages."""
        return self.packages

    def __get_or_create_package(self, package_name, package_count):
        """
        Returns the package named `package_name`. The package is created with `package_count` and registered
        on first use; `package_count` is ignored for a package that already exists.
        """
        package = self.__packages_by_name.get(package_name)
        if package is None:
            package = Package(package_name, package_count)
            self.__packages_by_name[package_name] = package
            self.package_names.add(package_name)
            self.packages.append(package)
        return package

    def get_parts(self):
        """Returns the sorted list of the distinct `part` values over all packages."""
        return sorted({part.part for package in self.packages for part in package.parts})

    def get_kinds(self):
        """Returns the sorted list of the distinct `kind` values over all packages."""
        return sorted({part.kind for package in self.packages for part in package.parts})

    def get_part_count(self, p):
        """Returns the count of the part `p` summed over all packages."""
        return sum(package.get_part_count(p) for package in self.packages)

View File

@@ -1,5 +1,7 @@
import subprocess
import os
import csv
import sys
def subprocess_run(cmd):
@@ -32,3 +34,19 @@ class LanguageConfig:
self.capitalized_lang = capitalized_lang
self.ext = ext
self.ql_path = ql_path
def read_cwes(path):
    """
    Reads the additional CWE data from the CSV file at `path`.

    The first line of the file is treated as a header and skipped, every other row has the form
    `CWE-89,sql,SQL injection`. Returns a dictionary that maps the CWE (first column) to a dictionary
    with the keys "sink" (second column) and "label" (third column). When a CWE occurs in several rows,
    the first row wins.

    An empty file yields an empty dictionary and blank lines are ignored. A non-blank row with fewer than
    three columns raises `IndexError`.
    """
    cwes = {}
    # The csv module expects its input file to be opened with newline=''.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue
            # row: CWE-89,sql,SQL injection
            cwe = row[0]
            if cwe not in cwes:
                cwes[cwe] = {
                    "sink": row[1],
                    "label": row[2]
                }
    return cwes