From dd776e312ae1a8b14a90567d608adaa949c133b9 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 19 Nov 2024 15:24:41 -0800 Subject: [PATCH] Add type information --- client/qldbtools/bin/mc-db-generate-selection | 15 ++++++++----- client/qldbtools/bin/mc-db-initial-info | 16 ++++++++++---- client/qldbtools/bin/mc-db-refine-info | 19 +++++++++++----- client/qldbtools/bin/mc-db-unique | 18 +++++++++------ client/qldbtools/qldbtools/utils.py | 22 +++++++++++-------- 5 files changed, 59 insertions(+), 31 deletions(-) diff --git a/client/qldbtools/bin/mc-db-generate-selection b/client/qldbtools/bin/mc-db-generate-selection index 009f7f2..d2064bc 100755 --- a/client/qldbtools/bin/mc-db-generate-selection +++ b/client/qldbtools/bin/mc-db-generate-selection @@ -6,6 +6,11 @@ """ import argparse import logging +from argparse import Namespace +from typing import List + +from pandas import DataFrame + import qldbtools.utils as utils import numpy as np @@ -41,18 +46,18 @@ parser.add_argument('-l', '--list-name', type=str, help='Name of the repository list', default='mirva-list') -args = parser.parse_args() +args: Namespace = parser.parse_args() # #* Load the information # import pandas as pd import sys -df0 = pd.read_csv(sys.stdin) +df0: DataFrame = pd.read_csv(sys.stdin) if args.num_entries == None: # Use all entries - df1 = df0 + df1: DataFrame = df0 else: # Use num_entries, chosen via pseudo-random numbers df1 = df0.sample(n=args.num_entries, @@ -61,12 +66,12 @@ else: # #* Form and save structures # -repos = [] +repos: list[str] = [] for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows(): owner, name, CID, path = row repos.append(utils.form_db_req_name(owner, name, CID)) -repo_list_name = args.list_name +repo_list_name: str = args.list_name vsc = { "version": 1, "databases": { diff --git a/client/qldbtools/bin/mc-db-initial-info b/client/qldbtools/bin/mc-db-initial-info index 793d233..280aeaf 100755 --- a/client/qldbtools/bin/mc-db-initial-info +++ 
b/client/qldbtools/bin/mc-db-initial-info @@ -2,11 +2,19 @@ """ Collect information about CodeQL databases from the file system and write out a table in CSV format. """ +from argparse import ArgumentParser +from typing import List + +from pandas import DataFrame + import qldbtools.utils as utils import argparse import logging import sys import pandas as pd + +from qldbtools.utils import DBInfo + # #* Configure logger # @@ -15,7 +23,7 @@ logging.basicConfig(format='%(asctime)s %(message)s') # #* Process command line # -parser = argparse.ArgumentParser( +parser: ArgumentParser = argparse.ArgumentParser( description="""Find all CodeQL DBs in and below starting_dir and export a CSV file with relevant data.""") parser.add_argument('starting_dir', type=str, @@ -26,9 +34,9 @@ args = parser.parse_args() #* Collect info # # Get the db information in list of DBInfo form -db_base = args.starting_dir -dbs = list(utils.collect_dbs(db_base)) -dbdf = pd.DataFrame([d.__dict__ for d in dbs]) +db_base: str = args.starting_dir +dbs: list[DBInfo] = list(utils.collect_dbs(db_base)) +dbdf: DataFrame = pd.DataFrame([d.__dict__ for d in dbs]) # # #* Write info out diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info index 0202356..7f4449e 100755 --- a/client/qldbtools/bin/mc-db-refine-info +++ b/client/qldbtools/bin/mc-db-refine-info @@ -3,6 +3,11 @@ mc-db-initial-info, and collect more detailed information from the database files. Write out an extended table in CSV format. 
""" +from argparse import ArgumentParser +from typing import List + +from pandas import DataFrame + import qldbtools.utils as utils import argparse import logging @@ -17,7 +22,7 @@ logging.basicConfig(format='%(asctime)s %(message)s') # #* Process command line # -parser = argparse.ArgumentParser( +parser: ArgumentParser = argparse.ArgumentParser( description="""Read an initial table of CodeQL DB information, produced by mc-db-initial-info, and collect more detailed information from the database files. Write out an extended table in CSV format. """) @@ -26,22 +31,24 @@ args = parser.parse_args() # #* Collect the information # This step is time-intensive so we save the results right after. -d = pd.read_csv(sys.stdin) -joiners = [] +d: DataFrame = pd.read_csv(sys.stdin) +joiners: list[DataFrame] = [] for left_index in range(0, len(d)-1): try: + metac: object + cqlc: object cqlc, metac = utils.extract_metadata(d.path[left_index]) except utils.ExtractNotZipfile: continue except utils.ExtractNoCQLDB: continue try: - detail_df = utils.metadata_details(left_index, cqlc, metac) + detail_df: DataFrame = utils.metadata_details(left_index, cqlc, metac) except utils.DetailsMissing: continue joiners.append(detail_df) -joiners_df = pd.concat(joiners, axis=0) -full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer') +joiners_df: DataFrame = pd.concat(joiners, axis=0) +full_df: DataFrame = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer') # #* Save results diff --git a/client/qldbtools/bin/mc-db-unique b/client/qldbtools/bin/mc-db-unique index fb974b5..df62438 100755 --- a/client/qldbtools/bin/mc-db-unique +++ b/client/qldbtools/bin/mc-db-unique @@ -18,6 +18,10 @@ """ import argparse import logging +from argparse import Namespace +from typing import Any + +from pandas import DataFrame, Series # #* Configure logger @@ -39,7 +43,7 @@ parser = argparse.ArgumentParser( parser.add_argument('language', type=str, help='The language 
to be analyzed.') -args = parser.parse_args() +args: Namespace = parser.parse_args() # #* Collect the information and select subset # @@ -47,7 +51,7 @@ import pandas as pd import sys import qldbtools.utils as utils -df2 = pd.read_csv(sys.stdin) +df2: DataFrame = pd.read_csv(sys.stdin) # #* Add single uniqueness field -- CID (Cumulative ID) @@ -88,7 +92,7 @@ df2['CID'] = df2.apply(lambda row: # | primaryLanguage | # | finalised | -df3 = df2.reindex( columns=['owner', 'name', 'cliVersion', 'creationTime', +df3: DataFrame = df2.reindex( columns=['owner', 'name', 'cliVersion', 'creationTime', 'language', 'sha','CID', 'baselineLinesOfCode', 'path', 'db_lang', 'db_lang_displayName', 'db_lang_file_count', @@ -101,14 +105,14 @@ rows = ( df3['cliVersion'].isna() | df3['creationTime'].isna() | df3['language'].isna() | df3['sha'].isna() ) -df4 = df3[~rows] +df4: DataFrame = df3[~rows] -# XX: Limit to one language +# Limit to one language df5 = df4[df4['language'] == args.language] # Sort and group -df_sorted = df5.sort_values(by=['owner', 'name', 'CID', 'creationTime']) -df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() +df_sorted: DataFrame = df5.sort_values(by=['owner', 'name', 'CID', 'creationTime']) +df_unique: DataFrame = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() # Write output df_unique.to_csv(sys.stdout, index=False) diff --git a/client/qldbtools/qldbtools/utils.py b/client/qldbtools/qldbtools/utils.py index d9cdad1..8056edc 100644 --- a/client/qldbtools/qldbtools/utils.py +++ b/client/qldbtools/qldbtools/utils.py @@ -9,11 +9,15 @@ import datetime import json import logging import os +from typing import List, Dict, Any, Iterator + import pandas as pd import time import yaml import zipfile +from pandas import DataFrame + #* Setup logging.basicConfig( level=logging.DEBUG, @@ -30,7 +34,7 @@ def log_and_raise_e(message, exception): logging.error(message) raise exception(message) -def traverse_tree(root: 
str) -> Iterator[Path]: root_path = Path(os.path.expanduser(root)) if not root_path.exists() or not root_path.is_dir(): log_and_raise(f"The specified root path '{root}' does not exist or " @@ -51,7 +55,7 @@ class DBInfo: size : int = 63083064 -def collect_dbs(db_base): +def collect_dbs(db_base: str) -> Iterator[DBInfo]: for path in traverse_tree(db_base): if path.name == "db.zip": # For the current repository, we have @@ -69,7 +73,7 @@ yield db -def extract_metadata(zipfile_path): +def extract_metadata(zipfile_path: str) -> tuple[object,object]: """ extract_metadata(zipfile) @@ -111,7 +115,7 @@ class ExtractNotZipfile(Exception): pass class ExtractNoCQLDB(Exception): pass -def metadata_details(left_index, codeql_content, meta_content): +def metadata_details(left_index: int, codeql_content: object, meta_content: object) -> pd.DataFrame: """ metadata_details(codeql_content, meta_content) @@ -143,11 +147,11 @@ 'finalised': cqlc.get('finalised', pd.NA), } f = pd.DataFrame(d, index=[0]) - joiners = [] + joiners: list[dict[str, int | Any]] = [] if not ('languages' in metac): log_and_raise_e("Missing 'languages' in metadata", DetailsMissing) for lang, lang_cont in metac['languages'].items(): - d1 = { 'left_index' : left_index, + d1: dict[str, int | Any] = { 'left_index' : left_index, 'db_lang': lang } for prop, val in lang_cont.items(): if prop == 'files': @@ -157,8 +161,8 @@ elif prop == 'displayName': d1['db_lang_displayName'] = val joiners.append(d1) - fj = pd.DataFrame(joiners) - full_df = pd.merge(f, fj, on='left_index', how='outer') + fj: DataFrame = pd.DataFrame(joiners) + full_df: DataFrame = pd.merge(f, fj, on='left_index', how='outer') return full_df class DetailsMissing(Exception): pass @@ -185,7 +189,7 @@ """ return f'{owner}${name}ctsj{CID}.zip' -def 
form_db_req_name(owner, name, CID): +def form_db_req_name(owner: str, name: str, CID: str) -> str: """ form_db_req_name(owner, name, CID) Return the name to use in mrva requests; this function is trivial and used to