#!/usr/bin/env python
"""Refine an initial table of CodeQL DB information.

Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files.  Write out an extended table in CSV format.

The input table is read from stdin and the extended table is written to
stdout.  Diagnostics (skipped databases, fatal errors) go to the logger,
which writes to stderr, so they never mix with the CSV output.
"""
import argparse
import logging
import sys

import pandas as pd

import qldbtools.utils as utils

#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')

#
#* Process command line
#
# There are no options yet; parsing is still done so that --help works and
# unexpected arguments are rejected.
parser = argparse.ArgumentParser(
    description=(
        "Read an initial table of CodeQL DB information, produced by "
        "mc-db-initial-info, and collect more detailed information from the "
        "database files. Write out an extended table in CSV format. "
    ))
args = parser.parse_args()

#
#* Collect the information
#
d = pd.read_csv(sys.stdin)

# One detail frame per database that could be read.  Each frame carries a
# 'left_index' column (used as the merge key below) pointing back at the
# originating row of `d`.
joiners = []
# Visit EVERY row of the input table, including the last one.
for left_index, db_path in enumerate(d['path']):
    try:
        cqlc, metac = utils.extract_metadata(db_path)
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB) as err:
        # Best effort: an unreadable entry is skipped, but not silently.
        logging.warning("Skipping %s: %s (%s)",
                        db_path, type(err).__name__, err)
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing as err:
        logging.warning("Skipping %s: %s (%s)",
                        db_path, type(err).__name__, err)
        continue
    joiners.append(detail_df)

# pd.concat() raises ValueError on an empty list; report the real problem
# instead of a traceback.
if not joiners:
    logging.error("No database details could be collected; nothing to write")
    sys.exit(1)

joiners_df = pd.concat(joiners, axis=0)
# Outer join: rows of `d` whose database was skipped are kept, with the
# detail columns left empty.
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                   how='outer')

#** Add single uniqueness field -- CID (Cumulative ID)
full_df['CID'] = full_df.apply(
    lambda row: utils.cid_hash((row['creationTime'],
                                row['sha'],
                                row['cliVersion'],
                                row['language'])),
    axis=1)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
#
# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |
#
# reindex() both orders the columns and creates (empty) any that are absent,
# so the output always has exactly this header.
final_df = full_df.reindex(
    columns=['owner', 'name', 'cliVersion', 'creationTime',
             'language', 'sha', 'CID',
             'baselineLinesOfCode', 'path', 'db_lang',
             'db_lang_displayName', 'db_lang_file_count',
             'db_lang_linesOfCode', 'ctime',
             'primaryLanguage', 'finalised', 'left_index', 'size'])

final_df.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: