Files
codeql/misc/scripts/library-coverage/generate-timeseries.py

170 lines
5.9 KiB
Python

import subprocess
import csv
import sys
import os
import shutil
from datetime import date
import datetime
import utils
import settings
import packages as pack
import frameworks as fr
"""
Gets the sink/source/summary statistics for different days.
"""
# the distance between commits to include in the output
day_distance = 1
class Git:
def get_output(arr):
r = subprocess.check_output(arr, text=True, env=os.environ.copy())
return r.strip("\n'")
def get_date(sha):
d = Git.get_output(
["git", "show", "--no-patch", "--no-notes", "--pretty='%cd'", "--date=short", sha])
return date.fromisoformat(d)
def get_parent(sha):
parent_sha = Git.get_output(
["git", "rev-parse", sha + "^"])
parent_date = Git.get_date(parent_sha)
return (parent_sha, parent_date)
def get_previous_sha(sha, date):
parent_sha, parent_date = Git.get_parent(sha)
while parent_date > date + datetime.timedelta(days=-1 * day_distance):
parent_sha, parent_date = Git.get_parent(parent_sha)
return (parent_sha, parent_date)
def get_packages(lang, query):
try:
db = "empty_" + lang
ql_output = "output-" + lang + ".csv"
if os.path.isdir(db):
shutil.rmtree(db)
utils.create_empty_database(lang, ".java", db)
utils.run_codeql_query(query, db, ql_output)
return pack.PackageCollection(ql_output)
except:
print("Unexpected error:", sys.exc_info()[0])
raise Exception()
finally:
if os.path.isfile(ql_output):
os.remove(ql_output)
if os.path.isdir(db):
shutil.rmtree(db)
working_dir = ""
if len(sys.argv) > 1:
working_dir = sys.argv[1]
configs = [
utils.LanguageConfig(
"java", "Java", ".java", "java/ql/src/meta/frameworks/Coverage.ql")
]
# todo: change this when we cover multiple languages. We should compute the SHAs
# only once and not per language
for config in configs:
with open("timeseries-" + config.lang + ".csv", 'w', newline='') as csvfile_total:
with open("timeseries-" + config.lang + "-packages.csv", 'w', newline='') as csvfile_packages:
csvwriter_total = csv.writer(csvfile_total)
csvwriter_packages = csv.writer(csvfile_packages)
csvwriter_total.writerow(
["SHA", "Date", "Sources", "Sinks", "Summaries"])
csvwriter_packages.writerow(
["SHA", "Date", "Framework", "Package", "Sources", "Sinks", "Summaries"])
os.chdir(working_dir)
utils.subprocess_run(["git", "checkout", "main"])
current_sha = Git.get_output(["git", "rev-parse", "HEAD"])
current_date = Git.get_date(current_sha)
# Read the additional framework data, such as URL, friendly name from the latest commit
input_framework_csv = settings.documentation_folder_no_prefix + "frameworks.csv"
frameworks = fr.FrameworkCollection(
input_framework_csv.format(language=config.lang))
while True:
print("Getting stats for " + current_sha)
utils.subprocess_run(["git", "checkout", current_sha])
try:
packages = get_packages(config.lang, config.ql_path)
csvwriter_total.writerow([
current_sha,
current_date,
packages.get_part_count("source"),
packages.get_part_count("sink"),
packages.get_part_count("summary")])
matched_packages = set()
for framework in frameworks.get_frameworks():
framework: fr.Framework = framework
row = [current_sha, current_date,
framework.name, framework.package_pattern]
sources = 0
sinks = 0
summaries = 0
for package in packages.get_packages():
if frameworks.get_package_filter(framework)(package):
sources += package.get_part_count("source")
sinks += package.get_part_count("sink")
summaries += package.get_part_count("summary")
matched_packages.add(package.name)
row.append(sources)
row.append(sinks)
row.append(summaries)
csvwriter_packages.writerow(row)
row = [current_sha, current_date, "Others"]
sources = 0
sinks = 0
summaries = 0
other_packages = set()
for package in packages.get_packages():
if not package.name in matched_packages:
sources += package.get_part_count("source")
sinks += package.get_part_count("sink")
summaries += package.get_part_count("summary")
other_packages.add(package.name)
row.append(", ".join(sorted(other_packages)))
row.append(sources)
row.append(sinks)
row.append(summaries)
csvwriter_packages.writerow(row)
print("Collected stats for " + current_sha +
" at " + current_date.isoformat())
except:
print("Error getting stats for " +
current_sha + ". Stopping iteration.")
break
current_sha, current_date = Git.get_previous_sha(
current_sha, current_date)
utils.subprocess_run(["git", "checkout", "main"])