mrvacommander/client/qldbtools/bin/mc-db-unique

#!/usr/bin/env python
""" Read a table of CodeQL DB information and produce a table with unique entries
    adding the Cumulative ID (CID) column.

    To make this happen:
    - Group entries by (owner,name,CID),
      sort each group by creationTime,
      and keep only the top (newest) element.

    - Drop rows that don't have the
          | cliVersion   |
          | creationTime |
          | language     |
          | sha          |
      columns.  There are very few (16 out of 6000 on recent tests) and their DBs
      are questionable.

"""
import argparse
import logging
import sys
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for the annotations below; the runtime imports are deferred
    # to main() so that argument parsing does not depend on them.
    from collections.abc import Callable

    import pandas as pd

#
#* Columns the CID is computed from
# Rows lacking any of these are dropped from the output.
#
CID_FIELDS = ('cliVersion', 'creationTime', 'language', 'sha')

#
#* Uniqueness key: one output row per distinct combination
#
GROUP_KEY = ['owner', 'name', 'CID']

#
#* Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#     | owner               |
#     | name                |
#     | language            |
#     | size                |
#     | cliVersion          |
#     | creationTime        |
#     | sha                 |
#     | baselineLinesOfCode |
#     | path                |
#
# - The rest are useful; put them last
#     | db_lang             |
#     | db_lang_displayName |
#     | db_lang_file_count  |
#     | db_lang_linesOfCode |
#     | left_index          |
#     | ctime               |
#     | primaryLanguage     |
#     | finalised           |
#
COLUMN_ORDER = ['owner', 'name', 'cliVersion', 'creationTime',
                'language', 'sha', 'CID',
                'baselineLinesOfCode', 'path', 'db_lang',
                'db_lang_displayName', 'db_lang_file_count',
                'db_lang_linesOfCode', 'ctime',
                'primaryLanguage', 'finalised', 'left_index',
                'size']


#
#* Configure logger
#
def configure_logging() -> None:
    """Set up timestamped INFO-level logging on the root logger."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    # basicConfig() is a no-op when the root logger already has handlers, so
    # set the level explicitly as well (the original note here names minio as
    # the library whose log level is being overridden).
    logging.getLogger().setLevel(logging.INFO)


#
#* Process command line
#
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command line.

    argv: argument list to parse; None (the default) means sys.argv[1:].

    Returns a namespace with a single attribute, `language`.
    """
    parser = argparse.ArgumentParser(
        description=""" Read a CSV table of CodeQL DB information from stdin,
        add the Cumulative ID (CID) column,
        drop rows missing cliVersion, creationTime, language or sha,
        narrow to <language>,
        group entries by (owner,name,CID), sort each group by creationTime,
        and write one row per group to stdout as CSV.
        """)
    parser.add_argument('language', type=str,
                        help='The language to be analyzed.')
    return parser.parse_args(argv)


#
#* Add single uniqueness field -- CID (Cumulative ID)
#
def add_cid(table: "pd.DataFrame",
            hash_row: "Callable[[tuple], object]") -> "pd.DataFrame":
    """Add a 'CID' column to `table` in place and return `table`.

    For every row, `hash_row` is called with the tuple
    (cliVersion, creationTime, language, sha) and its result is stored
    in the 'CID' column.

    Raises KeyError if a non-empty table lacks one of those columns.
    """
    if table.empty:
        # DataFrame.apply(axis=1) on a frame without rows returns an empty
        # DataFrame rather than a Series, and assigning that to a single
        # column raises ValueError.  A header-only input just gets an empty
        # CID column.
        table['CID'] = []
        return table

    table['CID'] = table.apply(
        lambda row: hash_row(tuple(row[field] for field in CID_FIELDS)),
        axis=1)
    return table


#
#* Select the subset and reduce it to unique entries
#
def select_unique(table: "pd.DataFrame", language: str) -> "pd.DataFrame":
    """Return one row per (owner, name, CID) for the given language.

    table:    frame that already carries the 'CID' column.
    language: only rows whose 'language' column equals this are kept.

    The result has the GROUP_KEY columns first, followed by the remaining
    COLUMN_ORDER columns.  Input columns not listed in COLUMN_ORDER are
    dropped; listed columns absent from the input come out empty.
    """
    ordered = table.reindex(columns=COLUMN_ORDER)

    # Identify rows missing specific entries and discard them
    incomplete = ordered[list(CID_FIELDS)].isna().any(axis=1)
    complete = ordered[~incomplete]

    # XX: Limit to one language
    one_language = complete[complete['language'] == language]

    # Sort and group.
    # NOTE(review): GroupBy.first() takes the first non-null value of each
    # column within a group, not the first row as a whole.
    # NOTE(review): the sort is ascending, so first() favours the earliest
    # creationTime.  creationTime is one of the CID inputs, so rows sharing a
    # CID are expected to share it too -- confirm before relying on "newest".
    ranked = one_language.sort_values(by=GROUP_KEY + ['creationTime'])
    return ranked.groupby(GROUP_KEY).first().reset_index()


def main() -> None:
    """Read the DB table from stdin and write the unique rows to stdout."""
    configure_logging()
    args = parse_args()

    #
    #* Collect the information and select subset
    #
    # Imported only after argument parsing, so --help and usage errors do not
    # depend on pandas or qldbtools being importable.
    import pandas as pd
    import qldbtools.utils as utils

    table = add_cid(pd.read_csv(sys.stdin), utils.cid_hash)

    # Write output
    select_unique(table, args.language).to_csv(sys.stdout, index=False)


if __name__ == "__main__":
    main()

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: