Rework CSV report generator and change timeseries report to use frameworks.csv

2025-12-17 01:03:14 +01:00 · 2021-06-08 14:51:49 +02:00
parent c6cb7c6eed
commit ba9c2e0702
5 changed files with 323 additions and 160 deletions
--- a/misc/scripts/library-coverage/frameworks.py
+++ b/misc/scripts/library-coverage/frameworks.py
@@ -0,0 +1,72 @@
+import csv
+import sys
+import packages
+
+
+class Framework:
+    """
+    A single frameworks.csv entry: the unit by which the RST and timeseries reports aggregate package statistics.
+    """
+
+    def __init__(self, name, url, package_pattern):
+        # package_pattern is an exact package name ("org.hibernate") or a prefix ending in '*' ("java.*")
+        self.package_pattern = package_pattern
+        self.name, self.url = name, url
+
+
+class FrameworkCollection:
+    """
+    A list of frameworks, sorted by name, read from a frameworks.csv file (a header row, then name,url,pattern rows).
+    """
+
+    def __init__(self, path):
+        self.frameworks: list[Framework] = []
+        self.package_patterns = set()
+
+        # newline='' is what the csv module documentation asks for when a file object is handed to csv.reader
+        with open(path, newline='') as csvfile:
+            reader = csv.reader(csvfile)
+            next(reader, None)  # skip the header row; the default avoids StopIteration on an empty file
+            for row in reader:
+                # row: Hibernate,https://hibernate.org/,org.hibernate
+                if row:  # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
+                    self.__add(Framework(row[0], row[1], row[2]))
+        self.__sort()
+
+    def __add(self, framework: Framework):
+        if framework.package_pattern not in self.package_patterns:
+            self.package_patterns.add(framework.package_pattern)
+            self.frameworks.append(framework)
+        else:  # the first framework with a pattern wins; the duplicate is reported but is not fatal
+            print("Package pattern already exists: " + framework.package_pattern, file=sys.stderr)
+
+    def __sort(self):
+        self.frameworks.sort(key=lambda f: f.name)
+
+    def get(self, framework_name):
+        """Returns the first framework with the given name, or None if there is none."""
+        return next((f for f in self.frameworks if f.name == framework_name), None)
+
+    def get_patterns(self):
+        return self.package_patterns
+
+    def get_frameworks(self):
+        return self.frameworks
+
+    def __package_match(self, package: packages.Package, pattern):
+        """Holds if the package name equals the pattern, or starts with its prefix when the pattern ends with '*'."""
+        if pattern.endswith("*"):
+            return package.name.startswith(pattern[:-1])
+        return pattern == package.name
+
+    def get_package_filter(self, framework: Framework):
+        """
+        Returns a lambda filter that holds for packages that match the current framework.
+
+        The pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
+        Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
+        the latter will not include the statistics for the former.
+        """
+        return lambda p: self.__package_match(p, framework.package_pattern) and not any(
+            len(pattern) > len(framework.package_pattern) and self.__package_match(p, pattern)
+            for pattern in self.get_patterns())
--- a/misc/scripts/library-coverage/generate-report.py
+++ b/misc/scripts/library-coverage/generate-report.py
@@ -4,6 +4,8 @@ import os
 import shutil
 import settings
 import utils
+import packages as pack
+import frameworks as fr

 """
 This script runs the CSV coverage report QL query, and transforms it to a more readable format.
@@ -28,14 +30,7 @@ def append_csv_dict_item(list, dictionary, key):
        list.append(None)


-def increment_dict_item(value, dictionary, key):
-    """Increments the value of the dictionary[key] by value."""
-    if key not in dictionary:
-        dictionary[key] = 0
-    dictionary[key] += int(value)
-
-
-def collect_package_stats(packages, cwes, filter):
+def collect_package_stats(packages: pack.PackageCollection, cwes, filter):
    """
    Collects coverage statistics for packages matching the given filter. `filter` is a `lambda` that for example (i) matches
    packages to frameworks, or (2) matches packages that were previously not processed.
@@ -48,20 +43,21 @@ def collect_package_stats(packages, cwes, filter):
    framework_cwes = {}
    processed_packages = set()

-    for package in packages:
+    for package in packages.get_packages():
+        package: pack.Package = package
        if filter(package):
            processed_packages.add(package)
-            sources += int(packages[package]["kind"].get("source:remote", 0))
-            steps += int(packages[package]["part"].get("summary", 0))
-            sinks += int(packages[package]["part"].get("sink", 0))
+            sources += package.get_kind_count("source:remote")
+            steps += package.get_part_count("summary")
+            sinks += package.get_part_count("sink")

            for cwe in cwes:
                sink = "sink:" + cwes[cwe]["sink"]
-                if sink in packages[package]["kind"]:
+                count = package.get_kind_count(sink)
+                if count > 0:
                    if cwe not in framework_cwes:
                        framework_cwes[cwe] = 0
-                    framework_cwes[cwe] += int(
-                        packages[package]["kind"][sink])
+                    framework_cwes[cwe] += count

    return sources, steps, sinks, framework_cwes, processed_packages

@@ -137,37 +133,12 @@ for config in configs:
    utils.run_codeql_query(config.ql_path, db, ql_output)
    shutil.rmtree(db)

-    packages = {}
-    parts = set()
-    kinds = set()
-
-    # Read the generated CSV file, and collect package statistics.
-    with open(ql_output) as csvfile:
-        reader = csv.reader(csvfile)
-        for row in reader:
-            # row: "android.util",1,"remote","source",16
-            package = row[0]
-            if package not in packages:
-                packages[package] = {
-                    "count": row[1],
-                    # part: "summary", "sink", or "source"
-                    "part": {},
-                    # kind: "source:remote", "sink:create-file", ...
-                    "kind": {}
-                }
-
-            part = row[3]
-            parts.add(part)
-            increment_dict_item(row[4], packages[package]["part"], part)
-
-            kind = part + ":" + row[2]
-            kinds.add(kind)
-            increment_dict_item(row[4], packages[package]["kind"], kind)
+    packages = pack.PackageCollection(ql_output)

    os.remove(ql_output)

-    parts = sorted(parts)
-    kinds = sorted(kinds)
+    parts = packages.get_parts()
+    kinds = packages.get_kinds()

    # Write the denormalized package statistics to a CSV file.
    with open(output_csv.format(language=lang), 'w', newline='') as csvfile:
@@ -179,44 +150,21 @@ for config in configs:

        csvwriter.writerow(headers)

-        for package in sorted(packages):
-            row = [package]
+        for package in packages.get_packages():
+            package: pack.Package = package
+            row = [package.name]
            for part in parts:
-                append_csv_dict_item(row, packages[package]["part"], part)
+                append_csv_number(row, package.get_part_count(part))
            for kind in kinds:
-                append_csv_dict_item(row, packages[package]["kind"], kind)
+                append_csv_number(row, package.get_kind_count(kind))
            csvwriter.writerow(row)

    # Read the additional framework data, such as URL, friendly name
-    frameworks = {}
-
-    with open(input_framework_csv.format(language=lang)) as csvfile:
-        reader = csv.reader(csvfile)
-        next(reader)
-        for row in reader:
-            # row: Hibernate,https://hibernate.org/,org.hibernate
-            framwork = row[0]
-            if framwork not in frameworks:
-                frameworks[framwork] = {
-                    "package": row[2],
-                    "url": row[1]
-                }
+    frameworks = fr.FrameworkCollection(
+        input_framework_csv.format(language=lang))

    # Read the additional CWE data
-    cwes = {}
-
-    with open(input_cwe_sink_csv.format(language=lang)) as csvfile:
-        reader = csv.reader(csvfile)
-        next(reader)
-        for row in reader:
-            # row: CWE-89,sql,SQL injection
-            cwe = row[0]
-            if cwe not in cwes:
-                cwes[cwe] = {
-                    "sink": row[1],
-                    "label": row[2]
-                }
-
+    cwes = utils.read_cwes(input_cwe_sink_csv.format(language=lang))
    sorted_cwes = sorted(cwes)

    with open(output_rst.format(language=lang), 'w', newline='') as rst_file:
@@ -246,34 +194,24 @@ for config in configs:

        processed_packages = set()

-        all_package_patterns = set(
-            (frameworks[fr]["package"] for fr in frameworks))
-
        # Write a row for each framework.
-        for framework in sorted(frameworks):
+        for framework in frameworks.get_frameworks():
+            framework: fr.Framework = framework
            row = []

            # Add the framework name to the row
-            if not frameworks[framework]["url"]:
-                row.append(row_prefix + framework)
+            if not framework.url:
+                row.append(row_prefix + framework.name)
            else:
                row.append(
-                    row_prefix + "`" + framework + " <" + frameworks[framework]["url"] + ">`_")
+                    row_prefix + "`" + framework.name + " <" + framework.url + ">`_")

            # Add the package name to the row
-            row.append("``" + frameworks[framework]["package"] + "``")
-
-            current_package_pattern = frameworks[framework]["package"]
+            row.append("``" + framework.package_pattern + "``")

            # Collect statistics on the current framework
-            # current_package_pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
-            # Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
-            # the latter will not include the statistics for the former.
-            def package_match(package_name, pattern): return (pattern.endswith(
-                "*") and package_name.startswith(pattern[:-1])) or (not pattern.endswith("*") and pattern == package_name)
-
            def collect_framework(): return collect_package_stats(
-                packages, cwes, lambda p: package_match(p, current_package_pattern) and all(len(current_package_pattern) >= len(pattern) or not package_match(p, pattern) for pattern in all_package_patterns))
+                packages, cwes, frameworks.get_package_filter(framework))

            row, f_processed_packages = add_package_stats_to_row(
                row, sorted_cwes, collect_framework)
@@ -290,8 +228,8 @@ for config in configs:
        row, other_packages = add_package_stats_to_row(
            row, sorted_cwes, collect_others)

-        row[1] = ", ".join("``{0}``".format(p)
-                           for p in sorted(other_packages))
+        row[1] = ", ".join("``{0}``".format(p.name)
+                           for p in sorted(other_packages, key=lambda x: x.name))

        csvwriter.writerow(row)

--- a/misc/scripts/library-coverage/generate-timeseries.py
+++ b/misc/scripts/library-coverage/generate-timeseries.py
@@ -6,42 +6,43 @@ import shutil
 from datetime import date
 import datetime
 import utils
+import settings
+import packages as pack
+import frameworks as fr

 """
-    Gets the sink/source/summary statistics for different days.
+Gets the sink/source/summary statistics for different days.
 """

 # the distance between commits to include in the output
 day_distance = 1


-def get_str_output(arr):
-    r = subprocess.check_output(arr)
-    return r.decode("utf-8").strip("\n'")
+class Git:
+    """Static helpers that shell out to `git` to look up commit SHAs and commit dates."""
+    @staticmethod
+    def get_output(arr):  # runs the command `arr`; returns its stdout with newlines and single quotes stripped
+        return subprocess.check_output(arr, text=True, env=os.environ.copy()).strip("\n'")
+
+    @staticmethod
+    def get_date(sha):  # date of `sha` as printed by git (`%cd`, --date=short), parsed into a datetime.date
+        d = Git.get_output(["git", "show", "--no-patch", "--no-notes", "--pretty='%cd'", "--date=short", sha])
+        return date.fromisoformat(d)
+
+    @staticmethod
+    def get_parent(sha):  # (sha, date) of `sha^`, i.e. the first parent of `sha`
+        parent_sha = Git.get_output(["git", "rev-parse", sha + "^"])
+        return (parent_sha, Git.get_date(parent_sha))
+
+    @staticmethod
+    def get_previous_sha(sha, date):  # walks first parents until one is at least `day_distance` days before `date`
+        parent_sha, parent_date = Git.get_parent(sha)
+        while parent_date > date + datetime.timedelta(days=-1 * day_distance):
+            parent_sha, parent_date = Git.get_parent(parent_sha)
+        return (parent_sha, parent_date)


-def get_date(sha):
-    d = get_str_output(
-        ["git", "show",  "--no-patch",  "--no-notes", "--pretty='%cd'",  "--date=short", sha])
-    return date.fromisoformat(d)
-
-
-def get_parent(sha, date):
-    parent_sha = get_str_output(
-        ["git", "rev-parse",  sha + "^"])
-    parent_date = get_date(parent_sha)
-    return (parent_sha, parent_date)
-
-
-def get_previous_sha(sha, date):
-    parent_sha, parent_date = get_parent(sha, date)
-    while parent_date > date + datetime.timedelta(days=-1 * day_distance):
-        parent_sha, parent_date = get_parent(parent_sha, parent_date)
-
-    return (parent_sha, parent_date)
-
-
-def get_stats(lang, query):
+def get_packages(lang, query):
    try:
        db = "empty_" + lang
        ql_output = "output-" + lang + ".csv"
@@ -50,41 +51,14 @@ def get_stats(lang, query):
        utils.create_empty_database(lang, ".java", db)
        utils.run_codeql_query(query, db, ql_output)

-        sources = 0
-        sinks = 0
-        summaries = 0
-
-        packages = {}
-
-        with open(ql_output) as csvfile:
-            reader = csv.reader(csvfile)
-            for row in reader:
-                # row: "android.util",1,"remote","source",16
-                package = row[0]
-                if package not in packages:
-                    packages[package] = {
-                        "sources": 0,
-                        "sinks": 0,
-                        "summaries": 0
-                    }
-
-                if row[3] == "source":
-                    sources += int(row[4])
-                    packages[package]["sources"] += int(row[4])
-                if row[3] == "sink":
-                    sinks += int(row[4])
-                    packages[package]["sinks"] += int(row[4])
-                if row[3] == "summary":
-                    summaries += int(row[4])
-                    packages[package]["summaries"] += int(row[4])
-
-        os.remove(ql_output)
-
-        return (sources, sinks, summaries, packages)
+        return pack.PackageCollection(ql_output)
    except:
        print("Unexpected error:", sys.exc_info()[0])
        raise Exception()
    finally:
+        if os.path.isfile(ql_output):
+            os.remove(ql_output)
+
        if os.path.isdir(db):
            shutil.rmtree(db)

@@ -108,28 +82,79 @@ for config in configs:
            csvwriter_total.writerow(
                ["SHA", "Date", "Sources", "Sinks", "Summaries"])
            csvwriter_packages.writerow(
-                ["SHA", "Date", "Package", "Sources", "Sinks", "Summaries"])
+                ["SHA", "Date", "Framework", "Package", "Sources", "Sinks", "Summaries"])

            os.chdir(working_dir)

            utils.subprocess_run(["git", "checkout", "main"])

-            current_sha = get_str_output(["git", "rev-parse", "HEAD"])
-            current_date = get_date(current_sha)
+            current_sha = Git.get_output(["git", "rev-parse", "HEAD"])
+            current_date = Git.get_date(current_sha)
+
+            # Read the additional framework data, such as URL, friendly name from the latest commit
+            input_framework_csv = settings.documentation_folder_no_prefix + "frameworks.csv"
+            frameworks = fr.FrameworkCollection(
+                input_framework_csv.format(language=config.lang))

            while True:
                print("Getting stats for " + current_sha)
                utils.subprocess_run(["git", "checkout", current_sha])

                try:
-                    stats = get_stats(config.lang, config.ql_path)
+                    packages = get_packages(config.lang, config.ql_path)

-                    csvwriter_total.writerow(
-                        [current_sha, current_date, stats[0], stats[1], stats[2]])
+                    csvwriter_total.writerow([
+                        current_sha,
+                        current_date,
+                        packages.get_part_count("source"),
+                        packages.get_part_count("sink"),
+                        packages.get_part_count("summary")])

-                    for package in stats[3]:
-                        csvwriter_packages.writerow(
-                            [current_sha, current_date, package, stats[3][package]["sources"], stats[3][package]["sinks"], stats[3][package]["summaries"]])
+                    matched_packages = set()
+
+                    for framework in frameworks.get_frameworks():
+                        framework: fr.Framework = framework
+
+                        row = [current_sha, current_date,
+                               framework.name, framework.package_pattern]
+
+                        sources = 0
+                        sinks = 0
+                        summaries = 0
+
+                        for package in packages.get_packages():
+                            if frameworks.get_package_filter(framework)(package):
+                                sources += package.get_part_count("source")
+                                sinks += package.get_part_count("sink")
+                                summaries += package.get_part_count("summary")
+                                matched_packages.add(package.name)
+
+                        row.append(sources)
+                        row.append(sinks)
+                        row.append(summaries)
+
+                        csvwriter_packages.writerow(row)
+
+                    row = [current_sha, current_date, "Others"]
+
+                    sources = 0
+                    sinks = 0
+                    summaries = 0
+                    other_packages = set()
+
+                    for package in packages.get_packages():
+                        if not package.name in matched_packages:
+                            sources += package.get_part_count("source")
+                            sinks += package.get_part_count("sink")
+                            summaries += package.get_part_count("summary")
+                            other_packages.add(package.name)
+
+                    row.append(", ".join(sorted(other_packages)))
+                    row.append(sources)
+                    row.append(sinks)
+                    row.append(summaries)
+
+                    csvwriter_packages.writerow(row)

                    print("Collected stats for " + current_sha +
                          " at " + current_date.isoformat())
@@ -138,7 +163,7 @@ for config in configs:
                          current_sha + ". Stopping iteration.")
                    break

-                current_sha, current_date = get_previous_sha(
+                current_sha, current_date = Git.get_previous_sha(
                    current_sha, current_date)

    utils.subprocess_run(["git", "checkout", "main"])
--- a/misc/scripts/library-coverage/packages.py
+++ b/misc/scripts/library-coverage/packages.py
@@ -0,0 +1,110 @@
+import csv
+
+
+class PackagePart:
+    """
+    One row of the QL query output, i.e. the count of a single kind of a single part of a package:
+    "android.util",1,"remote","source",16
+    """
+
+    def __init__(self, package, kind, part, count):
+        # PackageCollection passes the count as CSV text, hence the conversion
+        self.count = int(count)
+        self.package = package
+        # part is "summary", "sink", or "source"; kind is the part-qualified name, e.g. "source:remote"
+        self.part = part
+        self.kind = ":".join((part, kind))
+
+
+class Package:
+    """
+    A package and the parts collected for it; each part is one row the QL query returned for this package.
+    """
+
+    def __init__(self, name, package_count):
+        self.name = name
+        self.package_count = int(package_count)
+
+        # grows through add_part()
+        self.parts: list[PackagePart] = []
+
+    def add_part(self, part: PackagePart):
+        """Appends one more part to this package."""
+        self.parts += [part]
+
+    def get_part_count(self, p):
+        """
+        Returns the summed count of the parts whose part ("summary", "sink" or "source") is p, 0 if there are none.
+        """
+        return sum(entry.count for entry in self.parts if entry.part == p)
+
+    def get_kind_count(self, k):
+        """
+        Returns the summed count of the parts whose kind (such as "sink:create-file") is k, 0 if there are none.
+        """
+        return sum(entry.count for entry in self.parts if entry.kind == k)
+
+
+class PackageCollection:
+    """
+    A (sorted) list of packages. Packages are returned by the QL query in the form:
+    "android.util",1,"remote","source",16
+
+    And then the denormalized rows are aggregated by packages.
+    """
+
+    def __init__(self, ql_output_path):
+        self.packages: list[Package] = []
+        self.package_names = set()
+        # name -> Package index, so that a row finds its package without scanning self.packages
+        self.__packages_by_name = {}
+
+        # Read the generated CSV file, and collect package statistics.
+        # newline='' is what the csv module documentation asks for when a file object is handed to csv.reader.
+        with open(ql_output_path, newline='') as csvfile:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                # row: "android.util",1,"remote","source",16
+                if not row:
+                    # csv.reader yields [] for blank lines; there is nothing to aggregate in them
+                    continue
+
+                package = self.__get_or_create_package(row[0], row[1])
+                package.add_part(PackagePart(package, row[2], row[3], row[4]))
+        self.__sort()
+
+    def __sort(self):
+        self.packages.sort(key=lambda f: f.name)
+
+    def get_packages(self):
+        """Returns the packages, sorted by name."""
+        return self.packages
+
+    def __get_or_create_package(self, package_name, package_count):
+        """
+        Returns the package with the given name, creating and registering it the first time the name is seen.
+        package_count is only used on creation; later rows of the same package do not update it.
+        """
+        package = self.__packages_by_name.get(package_name)
+        if package is None:
+            package = Package(package_name, package_count)
+            self.__packages_by_name[package_name] = package
+            self.package_names.add(package_name)
+            self.packages.append(package)
+        return package
+
+    def get_parts(self):
+        """
+        Returns the sorted, distinct parts (such as "sink" or "source") present in any package.
+        """
+        return sorted({part.part for package in self.packages for part in package.parts})
+
+    def get_kinds(self):
+        """
+        Returns the sorted, distinct kinds (such as "source:remote") present in any package.
+        """
+        return sorted({part.kind for package in self.packages for part in package.parts})
+
+    def get_part_count(self, p):
+        """Returns the count of the given part summed over all packages."""
+        return sum(package.get_part_count(p) for package in self.packages)
--- a/misc/scripts/library-coverage/utils.py
+++ b/misc/scripts/library-coverage/utils.py
@@ -1,5 +1,7 @@
 import subprocess
 import os
+import csv
+import sys


 def subprocess_run(cmd):
@@ -32,3 +34,19 @@ class LanguageConfig:
        self.capitalized_lang = capitalized_lang
        self.ext = ext
        self.ql_path = ql_path
+
+
+def read_cwes(path):
+    """
+    Reads the CWE sink CSV (a header row, then cwe,sink,label rows) into {cwe: {"sink": ..., "label": ...}}.
+    Only the first row of a given CWE is kept; an empty file yields {} and blank lines are skipped.
+    """
+    cwes = {}
+    with open(path, newline='') as csvfile:  # newline='' as the csv module documentation recommends
+        reader = csv.reader(csvfile)
+        next(reader, None)  # skip the header row without raising StopIteration on an empty file
+        for row in reader:
+            # row: CWE-89,sql,SQL injection
+            if row and row[0] not in cwes:
+                cwes[row[0]] = {"sink": row[1], "label": row[2]}
+    return cwes