mrvacommander/client/qldbtools/bin/mc-db-refine-info

#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
    mc-db-initial-info, and collect more detailed information from the database
    files.  Write out an extended table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys

#
#* Configure logger
#
# Every log record is rendered as "<timestamp> <message>".
logging.basicConfig(format="%(asctime)s %(message)s")

#
#* Process command line
#
# No options are declared; the parser is built so that -h/--help prints the
# description below and so that unexpected arguments are rejected.
parser = argparse.ArgumentParser(description="""Read an initial table of CodeQL DB information, produced by
    mc-db-initial-info, and collect more detailed information from the database
    files.  Write out an extended table in CSV format. """)
args = parser.parse_args()

#
#* Collect the information
#
# Read the initial table from stdin; the 'path' column locates each database.
d = pd.read_csv(sys.stdin)

# Collect one detail frame per database that can be read.  Rows that cannot be
# processed are skipped with a warning; the outer merge below still keeps
# them, only without the extra detail columns.
joiners = []
# Visit every row (including the last one), pairing each index label with its
# path.  The label is handed to metadata_details, which presumably stores it
# in the 'left_index' column that the merge below joins on.
for left_index, db_path in d['path'].items():
    try:
        cqlc, metac = utils.extract_metadata(db_path)
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB) as err:
        logging.warning("Skipping %s: %s", db_path, type(err).__name__)
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        logging.warning("Skipping %s: DetailsMissing", db_path)
        continue
    joiners.append(detail_df)

# pd.concat raises ValueError on an empty list; stop with a clear message
# instead of an opaque traceback when nothing could be collected.
if not joiners:
    logging.error("No usable CodeQL database found in the input table")
    sys.exit(1)

joiners_df = pd.concat(joiners, axis=0)
# Outer join on the original row label so that skipped rows are retained.
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')

#** Add single uniqueness field -- CID (Cumulative ID)
# Hash each row's (creationTime, sha, cliVersion, language) tuple into a
# single value stored in the new 'CID' column.
full_df['CID'] = full_df.apply(
    lambda db_row: utils.cid_hash(
        tuple(db_row[field]
              for field in ('creationTime', 'sha', 'cliVersion', 'language'))),
    axis=1)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
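# - The essential columns are listed below.  CID (computed above) is emitted
#   with them, right after sha; note that the reindex call currently places
#   size at the very end of the output rather than among these columns.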
#     | owner               |
#     | name                |
#     | language            |
#     | size                |
#     | cliVersion          |
#     | creationTime        |
#     | sha                 |
#     | baselineLinesOfCode |
#     | path                |
#
# - The rest are useful; put them last
#     | db_lang             |
#     | db_lang_displayName |
#     | db_lang_file_count  |
#     | db_lang_linesOfCode |
#     | left_index          |
#     | ctime               |
#     | primaryLanguage     |
#     | finalised           |

# Select and order the output columns.  reindex keeps only the listed columns,
# in this order, and creates any listed column that is absent (filled with NaN).
final_df = full_df.reindex(columns=[
    'owner',
    'name',
    'cliVersion',
    'creationTime',
    'language',
    'sha',
    'CID',
    'baselineLinesOfCode',
    'path',
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
    'size',
])

# Write the extended table to stdout as CSV, omitting the DataFrame index.
final_df.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: