#!/usr/bin/env python
"""Read a table of CodeQL DB information and produce a table with unique
entries, adding the Cumulative ID (CID) column.

The table is read as CSV from stdin and written as CSV to stdout.

To make this happen:
- Drop rows that don't have the
  | cliVersion   |
  | creationTime |
  | language     |
  | sha          |
  columns.  There are very few (16 out of 6000 on recent tests) and their
  DBs are questionable.
- Narrow the table to the language given on the command line.
- Group entries by (owner,name,CID), sort each group by creationTime, and
  keep only the top (newest) element.
"""
import argparse
import logging
import sys
from typing import Callable

import pandas as pd

#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

#
#* Column definitions
#
# The fields that are combined into the CID.  A row missing any of them is
# dropped before the CID is computed.
CID_COLUMNS = ['cliVersion', 'creationTime', 'language', 'sha']

# The fields that identify one unique output row.
GROUP_COLUMNS = ['owner', 'name', 'CID']

# Order of the dataframe columns, by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
#
# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |
OUTPUT_COLUMNS = [
    'owner', 'name', 'cliVersion', 'creationTime', 'language', 'sha', 'CID',
    'baselineLinesOfCode', 'path', 'db_lang', 'db_lang_displayName',
    'db_lang_file_count', 'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
    'finalised', 'left_index', 'size',
]


def unique_db_table(df: pd.DataFrame, language: str,
                    cid_hash: Callable[[tuple], object]) -> pd.DataFrame:
    """Return one row per (owner, name, CID) for a single language.

    Args:
        df: Table of DB information.  It must contain the CID_COLUMNS;
            any OUTPUT_COLUMNS it lacks are added as empty columns.
        language: Only rows whose ``language`` equals this value are kept.
        cid_hash: Called once per kept row with the tuple
            ``(cliVersion, creationTime, language, sha)``; its return value
            becomes that row's ``CID``.

    Returns:
        A new dataframe with the group keys (owner, name, CID) first,
        followed by the remaining OUTPUT_COLUMNS.  ``df`` is not modified.
    """
    # Filter before hashing so the row-wise CID computation only runs on
    # rows that can appear in the output.
    complete = df[CID_COLUMNS].notna().all(axis=1)
    selected = df[complete & (df['language'] == language)].copy()

    #
    #* Add single uniqueness field -- CID (Cumulative ID)
    #
    selected['CID'] = selected.apply(
        lambda row: cid_hash(tuple(row[col] for col in CID_COLUMNS)),
        axis=1)

    ordered = selected.reindex(columns=OUTPUT_COLUMNS)

    # Sort and group.  creationTime is sorted descending so the newest entry
    # of each group comes first.
    newest_first = ordered.sort_values(
        by=GROUP_COLUMNS + ['creationTime'],
        ascending=[True, True, True, False])

    # NOTE: GroupBy.first() takes the first non-null value of every column,
    # so a field that is empty in the newest row is filled from an older row
    # of the same group.  Rows with a missing owner or name form no group and
    # are dropped here.
    return newest_first.groupby(GROUP_COLUMNS).first().reset_index()


def main() -> None:
    """Parse the command line, read CSV from stdin, write CSV to stdout."""
    #
    #* Process command line
    #
    parser = argparse.ArgumentParser(
        description="""
        Read a table of CodeQL DB information, narrow to the given language,
        group entries by (owner,name,CID), sort each group by creationTime
        and keep only the top (newest) element.
        """)
    parser.add_argument('language', type=str,
                        help='The language to be analyzed.')
    args = parser.parse_args()

    #
    #* Collect the information and select subset
    #
    # Project-local import; kept inside main() so the table logic above can
    # be imported without qldbtools being available.
    import qldbtools.utils as utils

    table = pd.read_csv(sys.stdin)
    unique = unique_db_table(table, args.language, utils.cid_hash)

    # Write output
    unique.to_csv(sys.stdout, index=False)


if __name__ == '__main__':
    main()

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: